[llvm] [AMDGPU] Avoid bundling a SCHED_BARRIER with memops (PR #153533)

Yoonseo Choi via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 22 09:09:50 PDT 2025


https://github.com/yoonseoch updated https://github.com/llvm/llvm-project/pull/153533

>From d63e693c178980d6eabfd60a0f8df1d6899e1e4f Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Wed, 13 Aug 2025 20:40:04 -0500
Subject: [PATCH 01/11] [AMDGPU] Do not bundle a meta-instruction with memory
 operations

---
 llvm/lib/Target/AMDGPU/SIPostRABundler.cpp    |  19 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll  | 930 +++++++++---------
 .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll   |  46 +-
 .../AMDGPU/gfx-callable-argument-types.ll     |  52 +-
 llvm/test/CodeGen/AMDGPU/max.i16.ll           |   1 -
 .../CodeGen/AMDGPU/postra-bundle-memops.mir   |  58 +-
 6 files changed, 553 insertions(+), 553 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index efdc55b8e68be..79be639993343 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -179,20 +179,19 @@ bool SIPostRABundler::run(MachineFunction &MF) {
         Next = std::next(I);
 
         assert(BundleEnd != I);
-        if (canBundle(*BundleEnd, *I)) {
+        if (!canBundle(*BundleEnd, *I)) {
+          // Do not allow even meta instructions (e.g. SCHED_BARRIER) bundled
+          // between memory operations.
+          // SCHED_BARRIERs are added by users for a finer control over schedule
+          // than bundling.
+          // Examples of meta instructions: WAVE_BARRIER, SCHED_{GROUP_}BARRIER,
+          // IGLP_OPT, amdgcn.unreachable.
+          break;
+        }
           BundleEnd = I;
           if (I->getNumExplicitDefs() != 0)
             Defs.insert(I->defs().begin()->getReg());
           ++ClauseLength;
-        } else if (!I->isMetaInstruction()) {
-          // Allow meta instructions in between bundle candidates, but do not
-          // start or end a bundle on one.
-          //
-          // TODO: It may be better to move meta instructions like dbg_value
-          // after the bundle. We're relying on the memory legalizer to unbundle
-          // these.
-          break;
-        }
       }
 
       Next = std::next(BundleEnd);
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index d03d6a8940b2f..6b47ed6e43984 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -4333,9 +4333,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr59
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
@@ -4437,18 +4442,45 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr59
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; implicit-def: $vgpr57
+; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr47
+; VI-NEXT:    ; implicit-def: $vgpr46
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr51
+; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr63
+; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr60
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
@@ -4478,45 +4510,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr53
-; VI-NEXT:    ; implicit-def: $vgpr57
-; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:    ; implicit-def: $vgpr46
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr63
-; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr62
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr60
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB12_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v31
@@ -4694,6 +4695,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    s_cbranch_execz .LBB12_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_add_u32_e32 v32, vcc, 3, v32
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_add_u32_e32 v31, vcc, 3, v31
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
 ; VI-NEXT:    v_add_u32_e32 v30, vcc, 3, v30
@@ -5302,9 +5304,16 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
@@ -5398,14 +5407,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; kill: killed $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr56
 ; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; implicit-def: $vgpr38
@@ -5437,15 +5444,15 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr42
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -5493,11 +5500,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(29)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB12_2
@@ -40052,9 +40054,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr59
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
@@ -40156,18 +40163,45 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr59
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; implicit-def: $vgpr57
+; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr47
+; VI-NEXT:    ; implicit-def: $vgpr46
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr51
+; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr63
+; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr60
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
@@ -40197,45 +40231,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr53
-; VI-NEXT:    ; implicit-def: $vgpr57
-; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:    ; implicit-def: $vgpr46
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr63
-; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr62
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr60
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB36_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v31
@@ -40413,6 +40416,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    s_cbranch_execz .LBB36_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_add_f32_e32 v32, 1.0, v32
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_add_f32_e32 v31, 1.0, v31
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
 ; VI-NEXT:    v_add_f32_e32 v30, 1.0, v30
@@ -41021,9 +41025,16 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
@@ -41117,14 +41128,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; kill: killed $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr56
 ; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; implicit-def: $vgpr38
@@ -41156,15 +41165,15 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr42
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -41212,11 +41221,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(29)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB36_2
@@ -75706,9 +75710,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr59
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
@@ -75810,18 +75819,45 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr59
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; implicit-def: $vgpr57
+; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr47
+; VI-NEXT:    ; implicit-def: $vgpr46
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr51
+; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr63
+; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr60
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
@@ -75851,45 +75887,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr53
-; VI-NEXT:    ; implicit-def: $vgpr57
-; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:    ; implicit-def: $vgpr46
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr63
-; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr62
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr60
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB56_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v31
@@ -76096,6 +76101,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; VI-NEXT:    v_addc_u32_e32 v28, vcc, 0, v28, vcc
 ; VI-NEXT:    v_add_u32_e32 v29, vcc, 3, v29
 ; VI-NEXT:    v_addc_u32_e32 v30, vcc, 0, v30, vcc
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_add_u32_e32 v31, vcc, 3, v31
 ; VI-NEXT:    v_addc_u32_e32 v32, vcc, 0, v32, vcc
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
@@ -76675,9 +76681,16 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
@@ -76771,14 +76784,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; kill: killed $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr56
 ; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; implicit-def: $vgpr38
@@ -76810,15 +76821,15 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr42
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -76866,11 +76877,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(29)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB56_2
@@ -109461,9 +109467,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr57
+; VI-NEXT:    ; implicit-def: $vgpr56
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr56
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
@@ -109563,18 +109574,45 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr57
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr56
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr47
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr46
+; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr63
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; implicit-def: $vgpr51
+; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr60
+; VI-NEXT:    ; implicit-def: $vgpr59
+; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; kill: killed $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr56
@@ -109607,39 +109645,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr56
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr46
-; VI-NEXT:    ; implicit-def: $vgpr53
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr63
-; VI-NEXT:    ; implicit-def: $vgpr62
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr60
-; VI-NEXT:    ; implicit-def: $vgpr59
-; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; kill: killed $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB72_2
@@ -109648,6 +109654,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v31
@@ -109822,6 +109829,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB72_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_add_f64 v[31:32], v[31:32], 1.0
 ; VI-NEXT:    v_add_f64 v[29:30], v[29:30], 1.0
 ; VI-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
@@ -110417,9 +110425,16 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; kill: killed $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
@@ -110513,14 +110528,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr48
 ; GFX9-NEXT:    ; kill: killed $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
-; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; kill: killed $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; kill: killed $vgpr48
 ; GFX9-NEXT:    ; implicit-def: $vgpr48
 ; GFX9-NEXT:    ; kill: killed $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr46
 ; GFX9-NEXT:    ; implicit-def: $vgpr54
 ; GFX9-NEXT:    ; implicit-def: $vgpr45
@@ -110552,15 +110565,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr48
 ; GFX9-NEXT:    ; kill: killed $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr42
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
@@ -110612,9 +110623,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(31)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB72_2
@@ -156456,12 +156464,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr34
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
 ; VI-NEXT:    ; implicit-def: $vgpr38
 ; VI-NEXT:    ; implicit-def: $vgpr55
 ; VI-NEXT:    ; implicit-def: $vgpr54
@@ -156486,52 +156498,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    ; implicit-def: $vgpr52
 ; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr56
 ; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    ; kill: killed $vgpr33
@@ -156640,6 +156611,43 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    ; kill: killed $vgpr33
 ; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB90_2
@@ -156774,6 +156782,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b64 v[44:45], 24, v[1:2]
 ; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshrrev_b64 v[44:45], 24, v[31:32]
 ; VI-NEXT:    v_lshrrev_b32_e32 v46, 24, v12
 ; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -158002,10 +158011,60 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    v_mov_b32_e32 v46, v15
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
@@ -158015,86 +158074,42 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr59
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr59
-; GFX9-NEXT:    ; implicit-def: $vgpr59
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr59
-; GFX9-NEXT:    ; implicit-def: $vgpr59
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr59
-; GFX9-NEXT:    ; implicit-def: $vgpr59
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    v_mov_b32_e32 v47, v16
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
-; GFX9-NEXT:    ; kill: killed $vgpr59
-; GFX9-NEXT:    ; implicit-def: $vgpr59
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr38
 ; GFX9-NEXT:    ; implicit-def: $vgpr51
@@ -158125,15 +158140,25 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr57
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr63
-; GFX9-NEXT:    ; kill: killed $vgpr59
-; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
@@ -158189,21 +158214,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(33)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB90_2
@@ -182746,9 +182756,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v27
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v25
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
 ; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v16
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v15
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
 ; VI-NEXT:    v_lshrrev_b32_e32 v54, 16, v12
 ; VI-NEXT:    v_lshrrev_b32_e32 v62, 16, v11
 ; VI-NEXT:    v_lshrrev_b32_e32 v63, 16, v10
@@ -182758,19 +182771,20 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v56, 16, v28
 ; VI-NEXT:    v_lshrrev_b32_e32 v58, 16, v26
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v24
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
 ; VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v22
 ; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v21
 ; VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v20
 ; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v19
 ; VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v18
 ; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v17
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr52
 ; VI-NEXT:    ; implicit-def: $vgpr45
 ; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr35
 ; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; VI-NEXT:    ; implicit-def: $vgpr31
@@ -182868,15 +182882,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
-; VI-NEXT:    s_waitcnt vmcnt(12)
-; VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v61
-; VI-NEXT:    s_waitcnt vmcnt(11)
-; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v60
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v61
+; VI-NEXT:    s_waitcnt vmcnt(13)
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v60
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr31
@@ -182888,15 +182902,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr31
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr35
 ; VI-NEXT:    ; implicit-def: $vgpr32
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr32
+; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr32
@@ -183793,9 +183803,17 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr44
+; GFX9-NEXT:    ; implicit-def: $vgpr53
+; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
@@ -183855,10 +183873,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
-; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
-; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr57
@@ -183885,41 +183901,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr52
 ; GFX9-NEXT:    ; implicit-def: $vgpr51
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr54
-; GFX9-NEXT:    ; implicit-def: $vgpr53
-; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(17)
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
@@ -183978,6 +183967,27 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr54
+; GFX9-NEXT:    ; implicit-def: $vgpr53
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB94_2
@@ -205305,12 +205315,74 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v27
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v26
+; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v25
+; VI-NEXT:    ; kill: killed $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v24
+; VI-NEXT:    ; kill: killed $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
+; VI-NEXT:    ; kill: killed $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v22
+; VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v21
+; VI-NEXT:    ; kill: killed $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
 ; VI-NEXT:    v_lshrrev_b32_e32 v60, 16, v16
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
 ; VI-NEXT:    v_lshrrev_b32_e32 v63, 16, v12
@@ -205320,20 +205392,34 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v56, 16, v4
 ; VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v30
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v22
-; VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v21
 ; VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v20
 ; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v19
 ; VI-NEXT:    v_lshrrev_b32_e32 v52, 16, v18
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr32
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr41
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v37
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v36
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
@@ -205404,7 +205490,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
-; VI-NEXT:    ; implicit-def: $vgpr32
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
@@ -205414,88 +205499,17 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v36
 ; VI-NEXT:    ; kill: killed $vgpr31
-; VI-NEXT:    ; implicit-def: $vgpr55
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
-; VI-NEXT:    ; implicit-def: $vgpr62
 ; VI-NEXT:    ; implicit-def: $vgpr31
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; kill: killed $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; kill: killed $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; kill: killed $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; kill: killed $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr54
@@ -205504,14 +205518,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr48
@@ -206506,9 +206516,17 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr44
+; GFX9-NEXT:    ; implicit-def: $vgpr53
+; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
@@ -206568,10 +206586,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
-; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
-; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr57
@@ -206598,41 +206614,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr52
 ; GFX9-NEXT:    ; implicit-def: $vgpr51
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr54
-; GFX9-NEXT:    ; implicit-def: $vgpr53
-; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(17)
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
@@ -206691,6 +206680,27 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr54
+; GFX9-NEXT:    ; implicit-def: $vgpr53
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB98_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 35d135b123969..85a69fbdb4386 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -63507,8 +63507,28 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ;
 ; VI-LABEL: bitcast_v32i16_to_v64i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    ; implicit-def: $vgpr19
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr19
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v17
 ; VI-NEXT:    ; implicit-def: $vgpr17
 ; VI-NEXT:    ; kill: killed $vgpr19
@@ -63529,23 +63549,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr17
 ; VI-NEXT:    ; kill: killed $vgpr19
 ; VI-NEXT:    ; implicit-def: $vgpr19
-; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr43
 ; VI-NEXT:    ; kill: killed $vgpr17
 ; VI-NEXT:    ; implicit-def: $vgpr17
 ; VI-NEXT:    ; kill: killed $vgpr19
@@ -63571,11 +63574,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr17
 ; VI-NEXT:    ; kill: killed $vgpr19
 ; VI-NEXT:    ; implicit-def: $vgpr19
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr41
 ; VI-NEXT:    ; implicit-def: $vgpr18
 ; VI-NEXT:    ; implicit-def: $vgpr54
@@ -63607,6 +63605,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr21
 ; VI-NEXT:    ; implicit-def: $vgpr20
 ; VI-NEXT:    ; implicit-def: $vgpr19
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr42
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 919464a936740..bc1d5956cec56 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -15780,18 +15780,16 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s14, 10
 ; GFX9-NEXT:    v_writelane_b32 v40, s15, 11
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
 ; GFX9-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX9-NEXT:    v_writelane_b32 v40, s21, 17
-; GFX9-NEXT:    v_writelane_b32 v40, s22, 18
-; GFX9-NEXT:    v_writelane_b32 v40, s23, 19
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s52, s[34:35], 0x0
-; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
-; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
+; GFX9-NEXT:    v_writelane_b32 v40, s22, 18
+; GFX9-NEXT:    v_writelane_b32 v40, s23, 19
 ; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
 ; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s24, 20
@@ -15814,6 +15812,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s50
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
+; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
+; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s51
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
@@ -15880,14 +15880,20 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 28
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    ; kill: killed $sgpr34_sgpr35
+; GFX10-NEXT:    ; kill: killed $sgpr34_sgpr35
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_load_dword s52, s[34:35], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s52
 ; GFX10-NEXT:    v_writelane_b32 v40, s12, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s13, 9
 ; GFX10-NEXT:    v_writelane_b32 v40, s14, 10
@@ -15896,44 +15902,39 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dword s52, s[34:35], 0x0
-; GFX10-NEXT:    ; meta instruction
-; GFX10-NEXT:    ; meta instruction
+; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
 ; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32_i32_inreg at abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX10-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX10-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s52
-; GFX10-NEXT:    v_mov_b32_e32 v1, s47
-; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s46
+; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
+; GFX10-NEXT:    v_mov_b32_e32 v1, s47
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s48
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s49
-; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
 ; GFX10-NEXT:    s_mov_b32 s20, s36
+; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
 ; GFX10-NEXT:    s_mov_b32 s21, s37
 ; GFX10-NEXT:    s_mov_b32 s22, s38
 ; GFX10-NEXT:    s_mov_b32 s23, s39
-; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
 ; GFX10-NEXT:    s_mov_b32 s24, s40
+; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
 ; GFX10-NEXT:    s_mov_b32 s25, s41
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s50
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s51
-; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
-; GFX10-NEXT:    s_mov_b32 s26, s42
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
 ; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
 ; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
+; GFX10-NEXT:    s_mov_b32 s26, s42
 ; GFX10-NEXT:    v_writelane_b32 v40, s27, 23
 ; GFX10-NEXT:    s_mov_b32 s27, s43
 ; GFX10-NEXT:    v_writelane_b32 v40, s28, 24
@@ -16099,15 +16100,22 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 28
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GFX10-SCRATCH-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s3, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, s2
+; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 24
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s12, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s13, 9
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s14, 10
@@ -16116,11 +16124,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_clause 0x2
-; GFX10-SCRATCH-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    ; meta instruction
-; GFX10-SCRATCH-NEXT:    ; meta instruction
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x40
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v32i32_i32_inreg at abs32@hi
@@ -16129,8 +16133,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, s2
-; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 24
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s50
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s23, 19
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s51
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index a5b64f6f80d9b..956ee8a3a5384 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -149,7 +149,6 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-NEXT:    global_load_dword v4, v0, s[2:3]
-; GFX9-NEXT:    ; kill: killed $sgpr2_sgpr3
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    global_load_short_d16 v1, v0, s[6:7] offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
index 1e6c00a54a012..b8972cce73bb5 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -114,7 +114,7 @@ body:             |
     $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
 ...
 
-# Middle dbg_value should be bundled
+# Loads are not bundled due to middle deb_value
 ---
 name: bundle_dbg_value_0
 tracksRegLiveness: true
@@ -124,18 +124,16 @@ body:             |
     ; GCN-LABEL: name: bundle_dbg_value_0
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
-    ; GCN-NEXT:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT:   DBG_VALUE $vgpr0, 0, 0
-    ; GCN-NEXT:   $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
-    ; GCN-NEXT: }
+    ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
+    ; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
     DBG_VALUE $vgpr0, 0, 0
     $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
 
 ...
 
-# Middle dbg_value should be bundled
+# Loads are not bundled due to middle deb_value
 ---
 name: bundle_dbg_value_1
 tracksRegLiveness: true
@@ -145,12 +143,10 @@ body:             |
     ; GCN-LABEL: name: bundle_dbg_value_1
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
-    ; GCN-NEXT:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT:   DBG_VALUE $vgpr0, 0, 0
-    ; GCN-NEXT:   DBG_VALUE $vgpr1, 0, 0
-    ; GCN-NEXT:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
-    ; GCN-NEXT: }
+    ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
+    ; GCN-NEXT: DBG_VALUE $vgpr1, 0, 0
+    ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
     ; GCN-NEXT: DBG_VALUE $vgpr2, 0, 0
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
     DBG_VALUE $vgpr0, 0, 0
@@ -159,7 +155,7 @@ body:             |
     DBG_VALUE $vgpr2, 0, 0
 ...
 
-# Starting and ending dbg_values should not be in the bundle
+# Loads are not bundled due to middle deb_value
 ---
 name: bundle_dbg_value_2
 tracksRegLiveness: true
@@ -170,11 +166,9 @@ body:             |
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: DBG_VALUE $vgpr1, 0, 0
-    ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
-    ; GCN-NEXT:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT:   DBG_VALUE $vgpr0, 0, 0
-    ; GCN-NEXT:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
-    ; GCN-NEXT: }
+    ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
+    ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
     ; GCN-NEXT: DBG_VALUE $vgpr2, 0, 0
     DBG_VALUE $vgpr1, 0, 0
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
@@ -183,6 +177,7 @@ body:             |
     DBG_VALUE $vgpr2, 0, 0
 ...
 
+# Loads are not bundled due to KILL
 ---
 name: bundle_kill
 tracksRegLiveness: true
@@ -193,11 +188,9 @@ body:             |
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 {
-    ; GCN-NEXT:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT:   KILL $vgpr1
-    ; GCN-NEXT:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
-    ; GCN-NEXT: }
+    ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT: KILL $vgpr1
+    ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
     $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
     KILL $vgpr1
@@ -215,11 +208,9 @@ body:             |
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
-    ; GCN-NEXT:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT:   KILL internal $vgpr0
-    ; GCN-NEXT:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
-    ; GCN-NEXT: }
+    ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT: KILL $vgpr0
+    ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
     $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
     KILL $vgpr0
@@ -341,11 +332,10 @@ body:             |
     ; GCN-LABEL: name: post_bundle_kill_and_null_reg_dbginfo
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
-    ; GCN-NEXT:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT:   DBG_VALUE $noreg, $noreg
-    ; GCN-NEXT:   $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
-    ; GCN-NEXT: }
+    ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT: DBG_VALUE $noreg, $noreg
+    ; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: KILL killed $vgpr3_vgpr4, killed $vgpr5_vgpr6
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
     DBG_VALUE $noreg, $noreg
     $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec

>From a502c34a4cdd28497a2056fedac92118766f6540 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Wed, 13 Aug 2025 20:48:10 -0500
Subject: [PATCH 02/11] formatting

---
 llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 79be639993343..f009bfd2ca372 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -188,10 +188,11 @@ bool SIPostRABundler::run(MachineFunction &MF) {
           // IGLP_OPT, amdgcn.unreachable.
           break;
         }
-          BundleEnd = I;
-          if (I->getNumExplicitDefs() != 0)
-            Defs.insert(I->defs().begin()->getReg());
-          ++ClauseLength;
+        
+        BundleEnd = I;
+        if (I->getNumExplicitDefs() != 0)
+          Defs.insert(I->defs().begin()->getReg());
+        ++ClauseLength;
       }
 
       Next = std::next(BundleEnd);

>From 6f98505a9a1358d5f1ac38a792456b8d95851b7f Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Wed, 13 Aug 2025 20:48:28 -0500
Subject: [PATCH 03/11] formatting

---
 llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index f009bfd2ca372..e12d9e6c69509 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -188,7 +188,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
           // IGLP_OPT, amdgcn.unreachable.
           break;
         }
-        
+
         BundleEnd = I;
         if (I->getNumExplicitDefs() != 0)
           Defs.insert(I->defs().begin()->getReg());

>From 273243c55f43ff687faf31f79e9e9f17457ee3aa Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Wed, 13 Aug 2025 23:32:38 -0500
Subject: [PATCH 04/11] Updating a lit-test

---
 .../CodeGen/AMDGPU/postra-bundle-memops.mir   | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
index b8972cce73bb5..10ee822405caa 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -341,3 +341,70 @@ body:             |
     $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
     KILL killed $vgpr3_vgpr4, killed $vgpr5_vgpr6
 ...
+
+# The 3rd and 4th loads are not bundled due to an in-between SCHED_BARRIER meta instruction. 
+# This helps SCHED_BARRIERs are honored maximally. 
+# MASK 1924 = 0b 0111 1000 0100 VALU and all VMEM, VMEM Read/Write cannot be
+#                     scheduled across SCHED_BARRIER. 
+
+---
+name: extra_sched_barrier_in_bundle
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; GCN-LABEL: name: extra_sched_barrier_in_bundle
+    ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+    ; GCN-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec
+    ; GCN-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec
+    ; GCN-NEXT: }
+    ; GCN-NEXT: renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+    ; GCN-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+    ; GCN-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    ; GCN-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    ; GCN-NEXT: SCHED_BARRIER 1924
+    ; GCN-NEXT: renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+    ; GCN-NEXT: SCHED_BARRIER 1924
+    ; GCN-NEXT: renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+    ; GCN-NEXT: renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+    ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GCN-NEXT: renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+    ; GCN-NEXT: renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+    ; GCN-NEXT: BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
+    ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec
+    ; GCN-NEXT: }
+    ; GCN-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
+    ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec
+    ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec
+    ; GCN-NEXT: }
+    ; GCN-NEXT: S_ENDPGM 0
+    renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+      renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec
+      renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec
+    }
+    renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+    renamable $vgpr10 = IMPLICIT_DEF
+    renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    SCHED_BARRIER 1924
+    renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+    SCHED_BARRIER 1924
+    renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+    renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+    renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+    BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec
+    }
+    BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec
+    }
+    S_ENDPGM 0
+...

>From c36948c1ec85b23e23342456c8a8514c046923b8 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Thu, 14 Aug 2025 11:25:36 -0500
Subject: [PATCH 05/11] Resolving a merge conflict

---
 .../test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir b/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir
index 5c7dc8a079345..fe9dd0294cc4d 100644
--- a/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir
+++ b/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir
@@ -23,12 +23,10 @@ body: |
     ; GCN-BUNDLE-LABEL: name: clause_implicit_def
     ; GCN-BUNDLE: liveins: $vgpr0, $sgpr32
     ; GCN-BUNDLE-NEXT: {{  $}}
-    ; GCN-BUNDLE-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit $sgpr32, implicit $exec, implicit $flat_scr {
-    ; GCN-BUNDLE-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr
-    ; GCN-BUNDLE-NEXT:   $vgpr2 = IMPLICIT_DEF
-    ; GCN-BUNDLE-NEXT:   $vgpr3 = IMPLICIT_DEF
-    ; GCN-BUNDLE-NEXT:   SCRATCH_STORE_DWORDX2_SADDR internal $vgpr2_vgpr3, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr
-    ; GCN-BUNDLE-NEXT: }
+    ; GCN-BUNDLE-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr
+    ; GCN-BUNDLE-NEXT: $vgpr2 = IMPLICIT_DEF
+    ; GCN-BUNDLE-NEXT: $vgpr3 = IMPLICIT_DEF
+    ; GCN-BUNDLE-NEXT: SCRATCH_STORE_DWORDX2_SADDR $vgpr2_vgpr3, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr
     SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr
     $vgpr2 = IMPLICIT_DEF
     $vgpr3 = IMPLICIT_DEF

>From b50f92d6cf046cc5aef89578b6235fd64e8c9c91 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Fri, 15 Aug 2025 18:07:39 -0500
Subject: [PATCH 06/11] Bypass MBB that has SCHED_BARRIER

---
 llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 32 +++++++++++++---------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index e12d9e6c69509..892e1a5d65f4f 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -15,6 +15,7 @@
 #include "SIPostRABundler.h"
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 
@@ -151,7 +152,12 @@ bool SIPostRABundler::run(MachineFunction &MF) {
   for (MachineBasicBlock &MBB : MF) {
     bool HasIGLPInstrs = llvm::any_of(MBB.instrs(), [](MachineInstr &MI) {
       unsigned Opc = MI.getOpcode();
-      return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
+      // Bypass a MBB with SCHED_BARRIER as well to make it honored.
+      // If SCHED_BARRIER is embedded between memory operations in a bundle,
+      // that SCHED_BARRIER is not picked up by IGLPmutation in post mi
+      // scheduler phase.
+      return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT ||
+             Opc == AMDGPU::SCHED_BARRIER;
     });
 
     // Don't cluster with IGLP instructions.
@@ -179,20 +185,20 @@ bool SIPostRABundler::run(MachineFunction &MF) {
         Next = std::next(I);
 
         assert(BundleEnd != I);
-        if (!canBundle(*BundleEnd, *I)) {
-          // Do not allow even meta instructions (e.g. SCHED_BARRIER) bundled
-          // between memory operations.
-          // SCHED_BARRIERs are added by users for a finer control over schedule
-          // than bundling.
-          // Examples of meta instructions: WAVE_BARRIER, SCHED_{GROUP_}BARRIER,
-          // IGLP_OPT, amdgcn.unreachable.
+        if (canBundle(*BundleEnd, *I)) {
+          BundleEnd = I;
+          if (I->getNumExplicitDefs() != 0)
+            Defs.insert(I->defs().begin()->getReg());
+          ++ClauseLength;
+        } else if (!!I->isMetaInstruction()) {
+          // Allow meta instructions in between bundle candidates, but do not
+          // start or end a bundle on one.
+          //
+          // TODO: It may be better to move meta instructions like dbg_value
+          // after the bundle. We're relying on the memory legalizer to unbundle
+          // these.
           break;
         }
-
-        BundleEnd = I;
-        if (I->getNumExplicitDefs() != 0)
-          Defs.insert(I->defs().begin()->getReg());
-        ++ClauseLength;
       }
 
       Next = std::next(BundleEnd);

>From 4cd281ce393b4429b06ef70f6a2070c34ae2ac31 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Fri, 15 Aug 2025 21:57:14 -0500
Subject: [PATCH 07/11] Update lit-tests

---
 llvm/lib/Target/AMDGPU/SIPostRABundler.cpp    |   2 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll  | 930 +++++++++---------
 .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll   |  46 +-
 .../AMDGPU/bundle-breaks-phy-liveness.mir     |  10 +-
 .../AMDGPU/gfx-callable-argument-types.ll     |  52 +-
 llvm/test/CodeGen/AMDGPU/max.i16.ll           |   1 +
 .../CodeGen/AMDGPU/postra-bundle-memops.mir   | 108 +-
 7 files changed, 601 insertions(+), 548 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 892e1a5d65f4f..73692c43adc46 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -190,7 +190,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
           if (I->getNumExplicitDefs() != 0)
             Defs.insert(I->defs().begin()->getReg());
           ++ClauseLength;
-        } else if (!!I->isMetaInstruction()) {
+        } else if (!I->isMetaInstruction()) {
           // Allow meta instructions in between bundle candidates, but do not
           // start or end a bundle on one.
           //
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 6b47ed6e43984..d03d6a8940b2f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -4333,16 +4333,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr59
-; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
@@ -4444,43 +4437,18 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr59
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr53
-; VI-NEXT:    ; implicit-def: $vgpr57
-; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:    ; implicit-def: $vgpr46
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr63
-; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr62
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr60
-; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; kill: killed $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr40
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
@@ -4510,14 +4478,45 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; implicit-def: $vgpr57
+; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr47
+; VI-NEXT:    ; implicit-def: $vgpr46
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr51
+; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr63
+; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr60
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB12_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v31
@@ -4695,7 +4694,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    s_cbranch_execz .LBB12_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_add_u32_e32 v32, vcc, 3, v32
-; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_add_u32_e32 v31, vcc, 3, v31
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
 ; VI-NEXT:    v_add_u32_e32 v30, vcc, 3, v30
@@ -5304,16 +5302,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
@@ -5407,12 +5398,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
+; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; kill: killed $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr56
 ; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; implicit-def: $vgpr38
@@ -5444,15 +5437,15 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr42
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -5500,6 +5493,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB12_2
@@ -40054,16 +40052,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr59
-; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
@@ -40165,43 +40156,18 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr59
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr53
-; VI-NEXT:    ; implicit-def: $vgpr57
-; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:    ; implicit-def: $vgpr46
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr63
-; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr62
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr60
-; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; kill: killed $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr40
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
@@ -40231,14 +40197,45 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; implicit-def: $vgpr57
+; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr47
+; VI-NEXT:    ; implicit-def: $vgpr46
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr51
+; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr63
+; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr60
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB36_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v31
@@ -40416,7 +40413,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    s_cbranch_execz .LBB36_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_add_f32_e32 v32, 1.0, v32
-; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_add_f32_e32 v31, 1.0, v31
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
 ; VI-NEXT:    v_add_f32_e32 v30, 1.0, v30
@@ -41025,16 +41021,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
@@ -41128,12 +41117,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
+; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; kill: killed $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr56
 ; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; implicit-def: $vgpr38
@@ -41165,15 +41156,15 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr42
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -41221,6 +41212,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB36_2
@@ -75710,16 +75706,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr59
-; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
@@ -75821,43 +75810,18 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr59
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr53
-; VI-NEXT:    ; implicit-def: $vgpr57
-; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:    ; implicit-def: $vgpr46
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr63
-; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr62
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr60
-; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; kill: killed $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr40
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
@@ -75887,14 +75851,45 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; implicit-def: $vgpr57
+; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr47
+; VI-NEXT:    ; implicit-def: $vgpr46
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr51
+; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr63
+; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr60
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB56_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v31
@@ -76101,7 +76096,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; VI-NEXT:    v_addc_u32_e32 v28, vcc, 0, v28, vcc
 ; VI-NEXT:    v_add_u32_e32 v29, vcc, 3, v29
 ; VI-NEXT:    v_addc_u32_e32 v30, vcc, 0, v30, vcc
-; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_add_u32_e32 v31, vcc, 3, v31
 ; VI-NEXT:    v_addc_u32_e32 v32, vcc, 0, v32, vcc
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
@@ -76681,16 +76675,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
@@ -76784,12 +76771,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
+; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; kill: killed $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr56
 ; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; implicit-def: $vgpr38
@@ -76821,15 +76810,15 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr42
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -76877,6 +76866,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB56_2
@@ -109467,16 +109461,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr57
-; VI-NEXT:    ; implicit-def: $vgpr56
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
@@ -109576,43 +109563,18 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr57
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr46
-; VI-NEXT:    ; implicit-def: $vgpr53
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr63
-; VI-NEXT:    ; implicit-def: $vgpr62
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr60
-; VI-NEXT:    ; implicit-def: $vgpr59
-; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr56
 ; VI-NEXT:    ; kill: killed $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr40
 ; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr56
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr56
@@ -109645,7 +109607,39 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr56
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr47
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr46
+; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr63
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; implicit-def: $vgpr51
+; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr60
+; VI-NEXT:    ; implicit-def: $vgpr59
+; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; kill: killed $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB72_2
@@ -109654,7 +109648,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v31
@@ -109829,7 +109822,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB72_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
-; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_add_f64 v[31:32], v[31:32], 1.0
 ; VI-NEXT:    v_add_f64 v[29:30], v[29:30], 1.0
 ; VI-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
@@ -110425,16 +110417,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; kill: killed $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
@@ -110528,12 +110513,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr48
 ; GFX9-NEXT:    ; kill: killed $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
+; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; kill: killed $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; kill: killed $vgpr48
 ; GFX9-NEXT:    ; implicit-def: $vgpr48
 ; GFX9-NEXT:    ; kill: killed $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr46
 ; GFX9-NEXT:    ; implicit-def: $vgpr54
 ; GFX9-NEXT:    ; implicit-def: $vgpr45
@@ -110565,13 +110552,15 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr48
 ; GFX9-NEXT:    ; kill: killed $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr42
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
@@ -110623,6 +110612,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(31)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB72_2
@@ -156464,16 +156456,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr34
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr35
 ; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr34
 ; VI-NEXT:    ; implicit-def: $vgpr38
 ; VI-NEXT:    ; implicit-def: $vgpr55
 ; VI-NEXT:    ; implicit-def: $vgpr54
@@ -156498,11 +156486,52 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    ; implicit-def: $vgpr52
 ; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
 ; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    ; kill: killed $vgpr33
@@ -156611,43 +156640,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    ; kill: killed $vgpr33
 ; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB90_2
@@ -156782,7 +156774,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b64 v[44:45], 24, v[1:2]
 ; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshrrev_b64 v[44:45], 24, v[31:32]
 ; VI-NEXT:    v_lshrrev_b32_e32 v46, 24, v12
 ; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -158011,60 +158002,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr59
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr59
-; GFX9-NEXT:    ; implicit-def: $vgpr59
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr59
-; GFX9-NEXT:    ; implicit-def: $vgpr59
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr59
-; GFX9-NEXT:    ; implicit-def: $vgpr59
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; kill: killed $vgpr59
-; GFX9-NEXT:    ; implicit-def: $vgpr59
-; GFX9-NEXT:    ; kill: killed $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    v_mov_b32_e32 v46, v15
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    ; kill: killed $vgpr59
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
@@ -158074,42 +158015,86 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    v_mov_b32_e32 v47, v16
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr38
 ; GFX9-NEXT:    ; implicit-def: $vgpr51
@@ -158140,25 +158125,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr57
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr63
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
@@ -158214,6 +158189,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(33)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB90_2
@@ -182756,12 +182746,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v27
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v25
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
 ; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v16
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v15
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
 ; VI-NEXT:    v_lshrrev_b32_e32 v54, 16, v12
 ; VI-NEXT:    v_lshrrev_b32_e32 v62, 16, v11
 ; VI-NEXT:    v_lshrrev_b32_e32 v63, 16, v10
@@ -182771,20 +182758,19 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v56, 16, v28
 ; VI-NEXT:    v_lshrrev_b32_e32 v58, 16, v26
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v24
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
 ; VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v22
 ; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v21
 ; VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v20
 ; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v19
 ; VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v18
 ; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v17
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr52
 ; VI-NEXT:    ; implicit-def: $vgpr45
 ; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr35
 ; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; VI-NEXT:    ; implicit-def: $vgpr31
@@ -182882,15 +182868,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    s_waitcnt vmcnt(12)
+; VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v61
+; VI-NEXT:    s_waitcnt vmcnt(11)
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v60
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr31
-; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v61
-; VI-NEXT:    s_waitcnt vmcnt(13)
-; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v60
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr31
@@ -182902,11 +182888,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr35
 ; VI-NEXT:    ; implicit-def: $vgpr32
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr32
-; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr32
@@ -183803,17 +183793,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr44
-; GFX9-NEXT:    ; implicit-def: $vgpr53
-; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
@@ -183873,8 +183855,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
+; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
+; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr57
@@ -183901,14 +183885,41 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr52
 ; GFX9-NEXT:    ; implicit-def: $vgpr51
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr54
+; GFX9-NEXT:    ; implicit-def: $vgpr53
+; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
@@ -183967,27 +183978,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr54
-; GFX9-NEXT:    ; implicit-def: $vgpr53
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB94_2
@@ -205315,74 +205305,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v27
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v26
-; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v25
-; VI-NEXT:    ; kill: killed $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v24
-; VI-NEXT:    ; kill: killed $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
-; VI-NEXT:    ; kill: killed $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v22
-; VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v21
-; VI-NEXT:    ; kill: killed $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; kill: killed $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
 ; VI-NEXT:    v_lshrrev_b32_e32 v60, 16, v16
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
 ; VI-NEXT:    v_lshrrev_b32_e32 v63, 16, v12
@@ -205392,34 +205320,20 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v56, 16, v4
 ; VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v30
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v22
+; VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v21
 ; VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v20
 ; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v19
 ; VI-NEXT:    v_lshrrev_b32_e32 v52, 16, v18
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr32
-; VI-NEXT:    ; implicit-def: $vgpr62
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; kill: killed $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v37
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v36
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
@@ -205490,6 +205404,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr32
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
@@ -205499,17 +205414,88 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
-; VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v36
 ; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr55
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr62
 ; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; kill: killed $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; kill: killed $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; kill: killed $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; kill: killed $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr54
@@ -205518,10 +205504,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr48
@@ -206516,17 +206506,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr44
-; GFX9-NEXT:    ; implicit-def: $vgpr53
-; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
@@ -206586,8 +206568,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
+; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
+; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr57
@@ -206614,14 +206598,41 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr52
 ; GFX9-NEXT:    ; implicit-def: $vgpr51
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr54
+; GFX9-NEXT:    ; implicit-def: $vgpr53
+; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
@@ -206680,27 +206691,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr54
-; GFX9-NEXT:    ; implicit-def: $vgpr53
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB98_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 85a69fbdb4386..35d135b123969 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -63507,28 +63507,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ;
 ; VI-LABEL: bitcast_v32i16_to_v64i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr19
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v17
 ; VI-NEXT:    ; implicit-def: $vgpr17
 ; VI-NEXT:    ; kill: killed $vgpr19
@@ -63549,6 +63529,23 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr17
 ; VI-NEXT:    ; kill: killed $vgpr19
 ; VI-NEXT:    ; implicit-def: $vgpr19
+; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr43
 ; VI-NEXT:    ; kill: killed $vgpr17
 ; VI-NEXT:    ; implicit-def: $vgpr17
 ; VI-NEXT:    ; kill: killed $vgpr19
@@ -63574,6 +63571,11 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr17
 ; VI-NEXT:    ; kill: killed $vgpr19
 ; VI-NEXT:    ; implicit-def: $vgpr19
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr41
 ; VI-NEXT:    ; implicit-def: $vgpr18
 ; VI-NEXT:    ; implicit-def: $vgpr54
@@ -63605,8 +63607,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr21
 ; VI-NEXT:    ; implicit-def: $vgpr20
 ; VI-NEXT:    ; implicit-def: $vgpr19
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr42
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir b/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir
index fe9dd0294cc4d..5c7dc8a079345 100644
--- a/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir
+++ b/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir
@@ -23,10 +23,12 @@ body: |
     ; GCN-BUNDLE-LABEL: name: clause_implicit_def
     ; GCN-BUNDLE: liveins: $vgpr0, $sgpr32
     ; GCN-BUNDLE-NEXT: {{  $}}
-    ; GCN-BUNDLE-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr
-    ; GCN-BUNDLE-NEXT: $vgpr2 = IMPLICIT_DEF
-    ; GCN-BUNDLE-NEXT: $vgpr3 = IMPLICIT_DEF
-    ; GCN-BUNDLE-NEXT: SCRATCH_STORE_DWORDX2_SADDR $vgpr2_vgpr3, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr
+    ; GCN-BUNDLE-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit $sgpr32, implicit $exec, implicit $flat_scr {
+    ; GCN-BUNDLE-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr
+    ; GCN-BUNDLE-NEXT:   $vgpr2 = IMPLICIT_DEF
+    ; GCN-BUNDLE-NEXT:   $vgpr3 = IMPLICIT_DEF
+    ; GCN-BUNDLE-NEXT:   SCRATCH_STORE_DWORDX2_SADDR internal $vgpr2_vgpr3, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr
+    ; GCN-BUNDLE-NEXT: }
     SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr
     $vgpr2 = IMPLICIT_DEF
     $vgpr3 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index bc1d5956cec56..919464a936740 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -15780,16 +15780,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s14, 10
 ; GFX9-NEXT:    v_writelane_b32 v40, s15, 11
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
 ; GFX9-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX9-NEXT:    v_writelane_b32 v40, s21, 17
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s52, s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX9-NEXT:    v_writelane_b32 v40, s23, 19
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s52, s[34:35], 0x0
+; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
+; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
 ; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
 ; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s24, 20
@@ -15812,8 +15814,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s50
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
-; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
-; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s51
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
@@ -15880,20 +15880,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 28
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    ; kill: killed $sgpr34_sgpr35
-; GFX10-NEXT:    ; kill: killed $sgpr34_sgpr35
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s52, s[34:35], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s52
 ; GFX10-NEXT:    v_writelane_b32 v40, s12, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s13, 9
 ; GFX10-NEXT:    v_writelane_b32 v40, s14, 10
@@ -15902,39 +15896,44 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    s_load_dword s52, s[34:35], 0x0
+; GFX10-NEXT:    ; meta instruction
+; GFX10-NEXT:    ; meta instruction
 ; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
 ; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32_i32_inreg at abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX10-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX10-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s46
-; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
+; GFX10-NEXT:    v_mov_b32_e32 v0, s52
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s47
+; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    v_mov_b32_e32 v0, s46
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s48
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s49
-; GFX10-NEXT:    s_mov_b32 s20, s36
 ; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
+; GFX10-NEXT:    s_mov_b32 s20, s36
 ; GFX10-NEXT:    s_mov_b32 s21, s37
 ; GFX10-NEXT:    s_mov_b32 s22, s38
 ; GFX10-NEXT:    s_mov_b32 s23, s39
-; GFX10-NEXT:    s_mov_b32 s24, s40
 ; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
+; GFX10-NEXT:    s_mov_b32 s24, s40
 ; GFX10-NEXT:    s_mov_b32 s25, s41
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s50
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s51
+; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
+; GFX10-NEXT:    s_mov_b32 s26, s42
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
 ; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
 ; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
-; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
-; GFX10-NEXT:    s_mov_b32 s26, s42
 ; GFX10-NEXT:    v_writelane_b32 v40, s27, 23
 ; GFX10-NEXT:    s_mov_b32 s27, s43
 ; GFX10-NEXT:    v_writelane_b32 v40, s28, 24
@@ -16100,22 +16099,15 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 28
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    ; kill: killed $sgpr0_sgpr1
-; GFX10-SCRATCH-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s3, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
-; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, s2
-; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 24
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s12, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s13, 9
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s14, 10
@@ -16124,7 +16116,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_clause 0x2
+; GFX10-SCRATCH-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    ; meta instruction
+; GFX10-SCRATCH-NEXT:    ; meta instruction
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x40
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v32i32_i32_inreg at abs32@hi
@@ -16133,6 +16129,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, s2
+; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 24
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s50
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s23, 19
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s51
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index 956ee8a3a5384..a5b64f6f80d9b 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -149,6 +149,7 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
 ; GFX9-NEXT:    global_load_dword v4, v0, s[2:3]
+; GFX9-NEXT:    ; kill: killed $sgpr2_sgpr3
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    global_load_short_d16 v1, v0, s[6:7] offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
index 10ee822405caa..0b90d551429ae 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -124,9 +124,11 @@ body:             |
     ; GCN-LABEL: name: bundle_dbg_value_0
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
-    ; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+    ; GCN-NEXT:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT:   DBG_VALUE $vgpr0, 0, 0
+    ; GCN-NEXT:   $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: }
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
     DBG_VALUE $vgpr0, 0, 0
     $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
@@ -143,10 +145,12 @@ body:             |
     ; GCN-LABEL: name: bundle_dbg_value_1
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
-    ; GCN-NEXT: DBG_VALUE $vgpr1, 0, 0
-    ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+    ; GCN-NEXT:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT:   DBG_VALUE $vgpr0, 0, 0
+    ; GCN-NEXT:   DBG_VALUE $vgpr1, 0, 0
+    ; GCN-NEXT:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: }
     ; GCN-NEXT: DBG_VALUE $vgpr2, 0, 0
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
     DBG_VALUE $vgpr0, 0, 0
@@ -166,9 +170,11 @@ body:             |
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: DBG_VALUE $vgpr1, 0, 0
-    ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
-    ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+    ; GCN-NEXT:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT:   DBG_VALUE $vgpr0, 0, 0
+    ; GCN-NEXT:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: }
     ; GCN-NEXT: DBG_VALUE $vgpr2, 0, 0
     DBG_VALUE $vgpr1, 0, 0
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
@@ -188,9 +194,11 @@ body:             |
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT: KILL $vgpr1
-    ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 {
+    ; GCN-NEXT:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT:   KILL $vgpr1
+    ; GCN-NEXT:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: }
     $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
     KILL $vgpr1
@@ -208,9 +216,11 @@ body:             |
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT: KILL $vgpr0
-    ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+    ; GCN-NEXT:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT:   KILL internal $vgpr0
+    ; GCN-NEXT:   $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: }
     $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
     KILL $vgpr0
@@ -332,20 +342,21 @@ body:             |
     ; GCN-LABEL: name: post_bundle_kill_and_null_reg_dbginfo
     ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
-    ; GCN-NEXT: DBG_VALUE $noreg, $noreg
-    ; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
-    ; GCN-NEXT: KILL killed $vgpr3_vgpr4, killed $vgpr5_vgpr6
+    ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+    ; GCN-NEXT:   $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT:   DBG_VALUE $noreg, $noreg
+    ; GCN-NEXT:   $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: }
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
     DBG_VALUE $noreg, $noreg
     $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
     KILL killed $vgpr3_vgpr4, killed $vgpr5_vgpr6
 ...
 
-# The 3rd and 4th loads are not bundled due to an in-between SCHED_BARRIER meta instruction. 
-# This helps SCHED_BARRIERs are honored maximally. 
+# The 3rd and 4th loads are not bundled due to an in-between SCHED_BARRIER meta instruction.
+# This helps SCHED_BARRIERs are honored maximally.
 # MASK 1924 = 0b 0111 1000 0100 VALU and all VMEM, VMEM Read/Write cannot be
-#                     scheduled across SCHED_BARRIER. 
+#                     scheduled across SCHED_BARRIER.
 
 ---
 name: extra_sched_barrier_in_bundle
@@ -408,3 +419,54 @@ body: |
     }
     S_ENDPGM 0
 ...
+
+# Bypass bundling if a MBB has SCHED_BARRIER
+---
+name: no_bundle_with_sched_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; GCN-LABEL: name: no_bundle_with_sched_barrier
+    ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GCN-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2
+    ; GCN-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0
+    ; GCN-NEXT: renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+    ; GCN-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+    ; GCN-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    ; GCN-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    ; GCN-NEXT: SCHED_BARRIER 1924
+    ; GCN-NEXT: renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+    ; GCN-NEXT: SCHED_BARRIER 1924
+    ; GCN-NEXT: renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+    ; GCN-NEXT: renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+    ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GCN-NEXT: renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+    ; GCN-NEXT: renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec, implicit killed $vgpr11
+    ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2
+    renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0
+    renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+    renamable $vgpr10 = IMPLICIT_DEF
+    renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    SCHED_BARRIER 1924
+    renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+    SCHED_BARRIER 1924
+    renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+    renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+    renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec, implicit killed $vgpr11
+    GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec
+    S_ENDPGM 0
+...

>From 5897ff31be824814dc3585518ae73f049d27d3a1 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Fri, 15 Aug 2025 22:39:39 -0500
Subject: [PATCH 08/11] Updating a lit-test

---
 .../CodeGen/AMDGPU/postra-bundle-memops.mir   | 76 +------------------
 1 file changed, 4 insertions(+), 72 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
index 0b90d551429ae..1b3503c76fc98 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -114,7 +114,7 @@ body:             |
     $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
 ...
 
-# Loads are not bundled due to middle deb_value
+# Middle dbg_value should be bundled
 ---
 name: bundle_dbg_value_0
 tracksRegLiveness: true
@@ -135,7 +135,7 @@ body:             |
 
 ...
 
-# Loads are not bundled due to middle deb_value
+# Middle dbg_value should be bundled
 ---
 name: bundle_dbg_value_1
 tracksRegLiveness: true
@@ -159,7 +159,7 @@ body:             |
     DBG_VALUE $vgpr2, 0, 0
 ...
 
-# Loads are not bundled due to middle deb_value
+# Starting and ending dbg_values should not be in the bundle
 ---
 name: bundle_dbg_value_2
 tracksRegLiveness: true
@@ -183,7 +183,6 @@ body:             |
     DBG_VALUE $vgpr2, 0, 0
 ...
 
-# Loads are not bundled due to KILL
 ---
 name: bundle_kill
 tracksRegLiveness: true
@@ -353,74 +352,7 @@ body:             |
     KILL killed $vgpr3_vgpr4, killed $vgpr5_vgpr6
 ...
 
-# The 3rd and 4th loads are not bundled due to an in-between SCHED_BARRIER meta instruction.
-# This helps SCHED_BARRIERs are honored maximally.
-# MASK 1924 = 0b 0111 1000 0100 VALU and all VMEM, VMEM Read/Write cannot be
-#                     scheduled across SCHED_BARRIER.
-
----
-name: extra_sched_barrier_in_bundle
-tracksRegLiveness: true
-body: |
-  bb.0:
-    ; GCN-LABEL: name: extra_sched_barrier_in_bundle
-    ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
-    ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
-    ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
-    ; GCN-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec
-    ; GCN-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec
-    ; GCN-NEXT: }
-    ; GCN-NEXT: renamable $sgpr2_sgpr3 = IMPLICIT_DEF
-    ; GCN-NEXT: renamable $vgpr10 = IMPLICIT_DEF
-    ; GCN-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
-    ; GCN-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
-    ; GCN-NEXT: SCHED_BARRIER 1924
-    ; GCN-NEXT: renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
-    ; GCN-NEXT: SCHED_BARRIER 1924
-    ; GCN-NEXT: renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
-    ; GCN-NEXT: renamable $sgpr4_sgpr5 = IMPLICIT_DEF
-    ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
-    ; GCN-NEXT: renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
-    ; GCN-NEXT: renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
-    ; GCN-NEXT: BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
-    ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec
-    ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec
-    ; GCN-NEXT: }
-    ; GCN-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
-    ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec
-    ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec
-    ; GCN-NEXT: }
-    ; GCN-NEXT: S_ENDPGM 0
-    renamable $sgpr0_sgpr1 = IMPLICIT_DEF
-    renamable $vgpr0 = IMPLICIT_DEF
-    BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
-      renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec
-      renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec
-    }
-    renamable $sgpr2_sgpr3 = IMPLICIT_DEF
-    renamable $vgpr10 = IMPLICIT_DEF
-    renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
-    renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
-    SCHED_BARRIER 1924
-    renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
-    SCHED_BARRIER 1924
-    renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
-    renamable $sgpr4_sgpr5 = IMPLICIT_DEF
-    renamable $vgpr0 = IMPLICIT_DEF
-    renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
-    renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
-    BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
-      GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec
-      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec
-    }
-    BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
-      GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec
-      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec
-    }
-    S_ENDPGM 0
-...
-
-# Bypass bundling if a MBB has SCHED_BARRIER
+# Avoid bundling if a MBB has SCHED_BARRIER
 ---
 name: no_bundle_with_sched_barrier
 tracksRegLiveness: true

>From badcaef552f506301a25dbb7fe38124ac8a2f48c Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Wed, 20 Aug 2025 17:49:43 -0500
Subject: [PATCH 09/11] Bundling ends either on non-meta-instruction or on
 SCHED_BARRIER

---
 llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 73692c43adc46..7d6cec290aa30 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -152,12 +152,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
   for (MachineBasicBlock &MBB : MF) {
     bool HasIGLPInstrs = llvm::any_of(MBB.instrs(), [](MachineInstr &MI) {
       unsigned Opc = MI.getOpcode();
-      // Bypass a MBB with SCHED_BARRIER as well to make it honored.
-      // If SCHED_BARRIER is embedded between memory operations in a bundle,
-      // that SCHED_BARRIER is not picked up by IGLPmutation in post mi
-      // scheduler phase.
-      return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT ||
-             Opc == AMDGPU::SCHED_BARRIER;
+      return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
     });
 
     // Don't cluster with IGLP instructions.
@@ -190,9 +185,11 @@ bool SIPostRABundler::run(MachineFunction &MF) {
           if (I->getNumExplicitDefs() != 0)
             Defs.insert(I->defs().begin()->getReg());
           ++ClauseLength;
-        } else if (!I->isMetaInstruction()) {
-          // Allow meta instructions in between bundle candidates, but do not
-          // start or end a bundle on one.
+        } else if (!I->isMetaInstruction() ||
+                   I->getOpcode() == AMDGPU::SCHED_BARRIER) {
+          // SCHED_BARRIER is not bundled to be honored by scheduler later.
+          // Allow other meta instructions in between bundle candidates, but do
+          // not start or end a bundle on one.
           //
           // TODO: It may be better to move meta instructions like dbg_value
           // after the bundle. We're relying on the memory legalizer to unbundle

>From e68e38ce0d75ecf7126f052e325d0ca8c862f03d Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Wed, 20 Aug 2025 18:06:29 -0500
Subject: [PATCH 10/11] updating a lit-test accordingly

---
 .../CodeGen/AMDGPU/postra-bundle-memops.mir    | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
index 1b3503c76fc98..8e0df53bbfa75 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -354,15 +354,17 @@ body:             |
 
 # Avoid bundling if a MBB has SCHED_BARRIER
 ---
-name: no_bundle_with_sched_barrier
+name: no_sched_barrier_within_bundle
 tracksRegLiveness: true
 body: |
   bb.0:
     ; GCN-LABEL: name: no_bundle_with_sched_barrier
     ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
     ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
-    ; GCN-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2
-    ; GCN-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0
+    ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+    ; GCN-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2
+    ; GCN-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0
+    ; GCN-NEXT: }
     ; GCN-NEXT: renamable $sgpr2_sgpr3 = IMPLICIT_DEF
     ; GCN-NEXT: renamable $vgpr10 = IMPLICIT_DEF
     ; GCN-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
@@ -375,10 +377,12 @@ body: |
     ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
     ; GCN-NEXT: renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
     ; GCN-NEXT: renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
-    ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec, implicit killed $vgpr11
-    ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec
-    ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec
-    ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec
+    ; GCN-NEXT: BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12, implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit killed $vgpr2 {
+    ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec, implicit killed $vgpr11
+    ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec
+    ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec
+    ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec
+    ; GCN-NEXT: }
     ; GCN-NEXT: S_ENDPGM 0
     renamable $sgpr0_sgpr1 = IMPLICIT_DEF
     renamable $vgpr0 = IMPLICIT_DEF

>From c5798f59d0c767d2d1b9a4a888547957c60130ff Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Wed, 20 Aug 2025 18:16:29 -0500
Subject: [PATCH 11/11] Remove staled change from merging

---
 llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 7d6cec290aa30..5720b978aada0 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -15,7 +15,6 @@
 #include "SIPostRABundler.h"
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 



More information about the llvm-commits mailing list