[llvm] [AMDGPU] SelectionDAG support for vector type set 0 to multiple sgpr64 (PR #128017)

Janek van Oirschot via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 12 04:36:31 PDT 2025


https://github.com/JanekvO updated https://github.com/llvm/llvm-project/pull/128017

>From f7e6f7b7f3bd16943a7874a717dbec2e04b5156f Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Thu, 20 Feb 2025 14:46:47 +0000
Subject: [PATCH] [AMDGPU] SelectionDAG support for vector type materialization
 of 0 to multiple sgpr64

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  55 +++
 .../CodeGen/AMDGPU/adjust-writemask-cse.ll    |  17 +-
 .../AMDGPU/agpr-copy-no-free-registers.ll     | 274 ++++++------
 .../AMDGPU/atomic-optimizer-strict-wqm.ll     |   6 +-
 .../AMDGPU/bitreverse-inline-immediates.ll    |   2 +-
 ...der-no-live-segment-at-def-implicit-def.ll |  53 +--
 llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll   |   9 +-
 llvm/test/CodeGen/AMDGPU/cluster_stores.ll    |  17 +-
 llvm/test/CodeGen/AMDGPU/collapse-endcf.ll    | 298 ++++++-------
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll |  14 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     |  34 +-
 .../AMDGPU/fix-sgpr-copies-nondeterminism.ll  |   8 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      | 412 +++++++-----------
 .../AMDGPU/gfx-callable-return-types.ll       |  14 +-
 .../hazard-recognizer-src-shared-base.ll      |   6 +-
 .../identical-subrange-spill-infloop.ll       | 337 +++++++-------
 llvm/test/CodeGen/AMDGPU/imm.ll               |   4 +-
 .../CodeGen/AMDGPU/indirect-addressing-si.ll  | 193 ++++----
 ...e92561-restore-undef-scc-verifier-error.ll |  18 +-
 ...474-need-live-out-undef-subregister-def.ll |   6 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  |   4 +-
 .../CodeGen/AMDGPU/masked-load-vectortypes.ll |  75 ++--
 .../CodeGen/AMDGPU/max-hard-clause-length.ll  |  22 +-
 .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll |   4 +-
 llvm/test/CodeGen/AMDGPU/mfma-loop.ll         |  13 +-
 .../AMDGPU/module-lds-false-sharing.ll        |  14 +-
 .../CodeGen/AMDGPU/no-dup-inst-prefetch.ll    |  36 +-
 .../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll     |  46 +-
 llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll | 106 ++++-
 ...calc-one-successor-two-predecessors-bug.ll |  36 +-
 ...t_kill_i1_for_floation_point_comparison.ll |  10 +-
 .../si-optimize-vgpr-live-range-dbg-instr.ll  |  10 +-
 .../CodeGen/AMDGPU/si-scheduler-exports.ll    |  13 +-
 llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll   |  25 +-
 .../AMDGPU/splitkit-getsubrangeformask.ll     |  35 +-
 .../AMDGPU/tuple-allocation-failure.ll        | 188 ++++----
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll |   5 +-
 llvm/test/CodeGen/AMDGPU/vopc_dpp.ll          |   2 +-
 .../CodeGen/AMDGPU/waterfall_kills_scc.ll     |  21 +-
 llvm/test/CodeGen/AMDGPU/wqm.ll               |  36 +-
 40 files changed, 1216 insertions(+), 1262 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8e90754103ff1..c990df622175f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -634,6 +634,61 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case ISD::BUILD_VECTOR: {
     EVT VT = N->getValueType(0);
     unsigned NumVectorElts = VT.getVectorNumElements();
+
+    auto IsSplatAllZeros = [this](SDNode *N) -> bool {
+      if (ISD::isConstantSplatVectorAllZeros(N))
+        return true;
+
+      // The legalizer may have split a multi-element vector of 16-bit elements
+      // into multiple BUILD_VECTORs. Peek through and check whether it is all
+      // zeros regardless of what the legalizer did. Assumes cases such as:
+      //   v8i16 build_vector 0, 0, 0, 0, 0, 0, 0, 0
+      //     -> legalizer ->
+      //   t0 = v2i16 build_vector 0, 0
+      //   t1 = bitcast t0 to i32
+      //   v4i32 build_vector t1, t1, t1, t1
+      if (CurDAG->isSplatValue(SDValue(N, 0))) {
+        SDValue Op = peekThroughBitcasts(N->getOperand(0));
+        EVT InnerVT = Op.getValueType();
+        if (InnerVT.isVector() && Op.getOpcode() == ISD::BUILD_VECTOR &&
+            InnerVT.getVectorNumElements() == 2)
+          return ISD::isConstantSplatVectorAllZeros(Op.getNode());
+      }
+      return false;
+    };
+    if (IsSplatAllZeros(N)) {
+      unsigned FixedBitSize = VT.getFixedSizeInBits();
+      SDLoc DL(N);
+      if (FixedBitSize == 64) {
+        SDValue Set0 = {
+            CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, MVT::i64,
+                                   CurDAG->getTargetConstant(0, DL, MVT::i64)),
+            0};
+        CurDAG->SelectNodeTo(N, AMDGPU::COPY, VT, Set0);
+        return;
+      } else if (NumVectorElts <= 32 && (FixedBitSize % 64 == 0)) {
+        SmallVector<SDValue, 32 * 2 + 1> Ops((FixedBitSize / 64) * 2 + 1);
+        SDValue Set0 = {
+            CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, MVT::i64,
+                                   CurDAG->getTargetConstant(0, DL, MVT::i64)),
+            0};
+        unsigned RCID =
+            SIRegisterInfo::getSGPRClassForBitWidth(FixedBitSize)->getID();
+        Ops[0] = CurDAG->getTargetConstant(RCID, DL, MVT::i32);
+
+        for (unsigned i = 0, CurrentBitSize = FixedBitSize; CurrentBitSize != 0;
+             ++i, CurrentBitSize -= 64) {
+          unsigned SubRegs =
+              SIRegisterInfo::getSubRegFromChannel(i * 2, /*NumRegs=*/2);
+          Ops[i * 2 + 1] = Set0;
+          Ops[i * 2 + 2] = CurDAG->getTargetConstant(SubRegs, DL, MVT::i32);
+        }
+
+        CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, VT, Ops);
+        return;
+      }
+    }
+
     if (VT.getScalarSizeInBits() == 16) {
       if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
         if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
diff --git a/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll b/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll
index 23a7bb6ece488..1d0a9f9585123 100644
--- a/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll
+++ b/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll
@@ -4,15 +4,14 @@
 define float @test() {
   ; GFX10-LABEL: name: test
   ; GFX10: bb.0.bb:
-  ; GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3, [[S_MOV_B32_]], %subreg.sub4, [[S_MOV_B32_]], %subreg.sub5, [[S_MOV_B32_]], %subreg.sub6, [[S_MOV_B32_]], %subreg.sub7
-  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GFX10-NEXT:   [[IMAGE_LOAD_V2_V2_nsa_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_nsa_gfx10 [[COPY]], [[COPY1]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
-  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub1
-  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub0
-  ; GFX10-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY2]], 0, killed [[COPY3]], 0, 0, implicit $mode, implicit $exec
-  ; GFX10-NEXT:   [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+  ; GFX10-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+  ; GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B64_]], %subreg.sub2_sub3, [[S_MOV_B64_]], %subreg.sub4_sub5, [[S_MOV_B64_]], %subreg.sub6_sub7
+  ; GFX10-NEXT:   [[V_MOV_B32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0
+  ; GFX10-NEXT:   [[IMAGE_LOAD_V2_V2_nsa_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_nsa_gfx10 [[V_MOV_B32_]], [[V_MOV_B32_]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub1
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub0
+  ; GFX10-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY]], 0, killed [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX10-NEXT:   [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
   ; GFX10-NEXT:   $vgpr0 = COPY [[V_ADD_F32_e64_1]]
   ; GFX10-NEXT:   SI_RETURN implicit $vgpr0
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 3116b5d59a097..1f69299f8ad77 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -515,48 +515,47 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    global_load_ushort v16, v[0:1], off glc
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX908-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX908-NEXT:    s_load_dword s7, s[8:9], 0x18
-; GFX908-NEXT:    s_mov_b32 s6, 0
-; GFX908-NEXT:    s_mov_b32 s9, s6
+; GFX908-NEXT:    s_mov_b32 s7, 0
+; GFX908-NEXT:    s_load_dword s8, s[8:9], 0x18
+; GFX908-NEXT:    v_mov_b32_e32 v19, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX908-NEXT:    s_sub_i32 s8, 0, s3
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v17, s7
-; GFX908-NEXT:    v_mov_b32_e32 v19, 0
-; GFX908-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX908-NEXT:    s_sub_i32 s6, 0, s3
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v17, s8
+; GFX908-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX908-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX908-NEXT:    v_cvt_u32_f32_e32 v2, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 0
-; GFX908-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX908-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX908-NEXT:    s_mul_i32 s8, s8, s10
-; GFX908-NEXT:    s_mul_hi_u32 s8, s10, s8
-; GFX908-NEXT:    s_add_i32 s10, s10, s8
-; GFX908-NEXT:    s_mul_hi_u32 s8, s2, s10
-; GFX908-NEXT:    s_mul_i32 s10, s8, s3
-; GFX908-NEXT:    s_sub_i32 s2, s2, s10
-; GFX908-NEXT:    s_add_i32 s11, s8, 1
-; GFX908-NEXT:    s_sub_i32 s10, s2, s3
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v2
+; GFX908-NEXT:    s_mul_i32 s6, s6, s9
+; GFX908-NEXT:    s_mul_hi_u32 s6, s9, s6
+; GFX908-NEXT:    s_add_i32 s9, s9, s6
+; GFX908-NEXT:    s_mul_hi_u32 s6, s2, s9
+; GFX908-NEXT:    s_mul_i32 s9, s6, s3
+; GFX908-NEXT:    s_sub_i32 s2, s2, s9
+; GFX908-NEXT:    s_add_i32 s10, s6, 1
+; GFX908-NEXT:    s_sub_i32 s9, s2, s3
 ; GFX908-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX908-NEXT:    s_cselect_b32 s8, s11, s8
-; GFX908-NEXT:    s_cselect_b32 s2, s10, s2
-; GFX908-NEXT:    s_add_i32 s10, s8, 1
+; GFX908-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX908-NEXT:    s_cselect_b32 s2, s9, s2
+; GFX908-NEXT:    s_add_i32 s9, s6, 1
 ; GFX908-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX908-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX908-NEXT:    s_lshr_b32 s7, s7, 16
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v18, s7
+; GFX908-NEXT:    s_cselect_b32 s6, s9, s6
+; GFX908-NEXT:    s_lshr_b32 s10, s8, 16
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v18, s10
+; GFX908-NEXT:    s_lshl_b64 s[10:11], s[6:7], 5
 ; GFX908-NEXT:    s_lshl_b64 s[2:3], s[0:1], 5
-; GFX908-NEXT:    s_lshl_b64 s[12:13], s[8:9], 5
-; GFX908-NEXT:    s_lshl_b64 s[10:11], s[4:5], 5
-; GFX908-NEXT:    s_or_b32 s10, s10, 28
+; GFX908-NEXT:    s_lshl_b64 s[8:9], s[4:5], 5
+; GFX908-NEXT:    s_or_b32 s8, s8, 28
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_readfirstlane_b32 s7, v16
 ; GFX908-NEXT:    s_and_b32 s7, 0xffff, s7
 ; GFX908-NEXT:    s_mul_i32 s1, s1, s7
-; GFX908-NEXT:    s_mul_hi_u32 s9, s0, s7
+; GFX908-NEXT:    s_mul_hi_u32 s12, s0, s7
 ; GFX908-NEXT:    s_mul_i32 s0, s0, s7
-; GFX908-NEXT:    s_add_i32 s1, s9, s1
-; GFX908-NEXT:    s_lshl_b64 s[14:15], s[0:1], 5
+; GFX908-NEXT:    s_add_i32 s1, s12, s1
+; GFX908-NEXT:    s_lshl_b64 s[12:13], s[0:1], 5
 ; GFX908-NEXT:    s_branch .LBB3_2
 ; GFX908-NEXT:  .LBB3_1: ; %Flow20
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
@@ -565,59 +564,58 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:  .LBB3_2: ; %bb9
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX908-NEXT:    ; Child Loop BB3_5 Depth 2
-; GFX908-NEXT:    s_mov_b64 s[16:17], -1
+; GFX908-NEXT:    s_mov_b64 s[14:15], -1
 ; GFX908-NEXT:    s_cbranch_scc0 .LBB3_10
 ; GFX908-NEXT:  ; %bb.3: ; %bb14
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
 ; GFX908-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
-; GFX908-NEXT:    s_mov_b32 s7, s6
-; GFX908-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX908-NEXT:    v_mov_b32_e32 v4, s6
-; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v6
-; GFX908-NEXT:    v_mov_b32_e32 v6, s6
-; GFX908-NEXT:    v_mov_b32_e32 v9, s7
-; GFX908-NEXT:    v_mov_b32_e32 v5, s7
-; GFX908-NEXT:    v_mov_b32_e32 v7, s7
-; GFX908-NEXT:    v_mov_b32_e32 v8, s6
-; GFX908-NEXT:    v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
-; GFX908-NEXT:    v_mov_b32_e32 v11, v5
-; GFX908-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; GFX908-NEXT:    v_mov_b32_e32 v10, v4
+; GFX908-NEXT:    v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v4, 0
+; GFX908-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
+; GFX908-NEXT:    v_mov_b32_e32 v6, 0
+; GFX908-NEXT:    v_mov_b32_e32 v8, 0
+; GFX908-NEXT:    v_mov_b32_e32 v10, 0
+; GFX908-NEXT:    v_mov_b32_e32 v5, 0
+; GFX908-NEXT:    v_mov_b32_e32 v7, 0
+; GFX908-NEXT:    v_mov_b32_e32 v9, 0
+; GFX908-NEXT:    v_mov_b32_e32 v11, 0
+; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v12
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_readfirstlane_b32 s7, v2
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v3
+; GFX908-NEXT:    v_readfirstlane_b32 s16, v3
 ; GFX908-NEXT:    s_add_u32 s7, s7, 1
-; GFX908-NEXT:    s_addc_u32 s9, s9, 0
-; GFX908-NEXT:    s_mul_hi_u32 s20, s2, s7
-; GFX908-NEXT:    s_mul_i32 s9, s2, s9
-; GFX908-NEXT:    s_mul_i32 s21, s3, s7
-; GFX908-NEXT:    s_add_i32 s9, s20, s9
+; GFX908-NEXT:    s_addc_u32 s16, s16, 0
+; GFX908-NEXT:    s_mul_hi_u32 s17, s2, s7
+; GFX908-NEXT:    s_mul_i32 s16, s2, s16
+; GFX908-NEXT:    s_mul_i32 s18, s3, s7
+; GFX908-NEXT:    s_add_i32 s16, s17, s16
 ; GFX908-NEXT:    s_mul_i32 s7, s2, s7
-; GFX908-NEXT:    s_add_i32 s9, s9, s21
+; GFX908-NEXT:    s_add_i32 s22, s16, s18
+; GFX908-NEXT:    s_mov_b64 s[16:17], s[8:9]
 ; GFX908-NEXT:    s_branch .LBB3_5
 ; GFX908-NEXT:  .LBB3_4: ; %bb58
 ; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
 ; GFX908-NEXT:    v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX908-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX908-NEXT:    s_add_u32 s18, s18, s14
-; GFX908-NEXT:    v_cmp_lt_i64_e64 s[22:23], -1, v[2:3]
-; GFX908-NEXT:    s_addc_u32 s19, s19, s15
-; GFX908-NEXT:    s_mov_b64 s[20:21], 0
-; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[22:23]
+; GFX908-NEXT:    s_add_u32 s16, s16, s12
+; GFX908-NEXT:    v_cmp_lt_i64_e64 s[20:21], -1, v[2:3]
+; GFX908-NEXT:    s_addc_u32 s17, s17, s13
+; GFX908-NEXT:    s_mov_b64 s[18:19], 0
+; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_9
 ; GFX908-NEXT:  .LBB3_5: ; %bb16
 ; GFX908-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT:    s_add_u32 s20, s18, s7
-; GFX908-NEXT:    s_addc_u32 s21, s19, s9
-; GFX908-NEXT:    global_load_dword v21, v19, s[20:21] offset:-12 glc
+; GFX908-NEXT:    s_add_u32 s18, s16, s7
+; GFX908-NEXT:    s_addc_u32 s19, s17, s22
+; GFX908-NEXT:    global_load_dword v21, v19, s[18:19] offset:-12 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v20, v19, s[20:21] offset:-8 glc
+; GFX908-NEXT:    global_load_dword v20, v19, s[18:19] offset:-8 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v12, v19, s[20:21] offset:-4 glc
+; GFX908-NEXT:    global_load_dword v12, v19, s[18:19] offset:-4 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v12, v19, s[20:21] glc
+; GFX908-NEXT:    global_load_dword v12, v19, s[18:19] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    ds_read_b64 v[12:13], v19
 ; GFX908-NEXT:    ds_read_b64 v[14:15], v0
@@ -648,27 +646,27 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    v_add_f32_e32 v11, v11, v13
 ; GFX908-NEXT:    s_branch .LBB3_4
 ; GFX908-NEXT:  .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
-; GFX908-NEXT:    s_mov_b64 s[20:21], s[16:17]
-; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
+; GFX908-NEXT:    s_mov_b64 s[18:19], s[14:15]
+; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX908-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_mov_b64 s[20:21], -1
+; GFX908-NEXT:    s_mov_b64 s[18:19], -1
 ; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX908-NEXT:    ; implicit-def: $sgpr18_sgpr19
+; GFX908-NEXT:    ; implicit-def: $sgpr16_sgpr17
 ; GFX908-NEXT:  .LBB3_9: ; %loop.exit.guard
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_xor_b64 s[16:17], s[20:21], -1
+; GFX908-NEXT:    s_xor_b64 s[14:15], s[18:19], -1
 ; GFX908-NEXT:  .LBB3_10: ; %Flow19
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    s_mov_b64 s[0:1], -1
-; GFX908-NEXT:    s_and_b64 vcc, exec, s[16:17]
+; GFX908-NEXT:    s_and_b64 vcc, exec, s[14:15]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX908-NEXT:  ; %bb.11: ; %bb12
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_add_u32 s4, s4, s8
+; GFX908-NEXT:    s_add_u32 s4, s4, s6
 ; GFX908-NEXT:    s_addc_u32 s5, s5, 0
-; GFX908-NEXT:    s_add_u32 s10, s10, s12
-; GFX908-NEXT:    s_addc_u32 s11, s11, s13
+; GFX908-NEXT:    s_add_u32 s8, s8, s10
+; GFX908-NEXT:    s_addc_u32 s9, s9, s11
 ; GFX908-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX908-NEXT:    s_branch .LBB3_1
 ; GFX908-NEXT:  .LBB3_12: ; %DummyReturnBlock
@@ -679,47 +677,46 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    global_load_ushort v18, v[0:1], off glc
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX90A-NEXT:    s_load_dword s7, s[8:9], 0x18
-; GFX90A-NEXT:    s_mov_b32 s6, 0
-; GFX90A-NEXT:    s_mov_b32 s9, s6
+; GFX90A-NEXT:    s_mov_b32 s7, 0
+; GFX90A-NEXT:    s_load_dword s8, s[8:9], 0x18
+; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_sub_i32 s8, 0, s3
-; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX90A-NEXT:    s_sub_i32 s6, 0, s3
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s8
+; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GFX90A-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v2
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s7
-; GFX90A-NEXT:    v_readfirstlane_b32 s10, v3
-; GFX90A-NEXT:    s_mul_i32 s8, s8, s10
-; GFX90A-NEXT:    s_mul_hi_u32 s8, s10, s8
-; GFX90A-NEXT:    s_add_i32 s10, s10, s8
-; GFX90A-NEXT:    s_mul_hi_u32 s8, s2, s10
-; GFX90A-NEXT:    s_mul_i32 s10, s8, s3
-; GFX90A-NEXT:    s_sub_i32 s2, s2, s10
-; GFX90A-NEXT:    s_add_i32 s11, s8, 1
-; GFX90A-NEXT:    s_sub_i32 s10, s2, s3
+; GFX90A-NEXT:    v_readfirstlane_b32 s9, v3
+; GFX90A-NEXT:    s_mul_i32 s6, s6, s9
+; GFX90A-NEXT:    s_mul_hi_u32 s6, s9, s6
+; GFX90A-NEXT:    s_add_i32 s9, s9, s6
+; GFX90A-NEXT:    s_mul_hi_u32 s6, s2, s9
+; GFX90A-NEXT:    s_mul_i32 s9, s6, s3
+; GFX90A-NEXT:    s_sub_i32 s2, s2, s9
+; GFX90A-NEXT:    s_add_i32 s10, s6, 1
+; GFX90A-NEXT:    s_sub_i32 s9, s2, s3
 ; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX90A-NEXT:    s_cselect_b32 s8, s11, s8
-; GFX90A-NEXT:    s_cselect_b32 s2, s10, s2
-; GFX90A-NEXT:    s_add_i32 s10, s8, 1
+; GFX90A-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX90A-NEXT:    s_cselect_b32 s2, s9, s2
+; GFX90A-NEXT:    s_add_i32 s9, s6, 1
 ; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX90A-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX90A-NEXT:    s_lshr_b32 s7, s7, 16
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, s7
+; GFX90A-NEXT:    s_cselect_b32 s6, s9, s6
+; GFX90A-NEXT:    s_lshr_b32 s10, s8, 16
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, s10
+; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[6:7], 5
 ; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[0:1], 5
-; GFX90A-NEXT:    s_lshl_b64 s[12:13], s[8:9], 5
-; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[4:5], 5
-; GFX90A-NEXT:    s_or_b32 s10, s10, 28
+; GFX90A-NEXT:    s_lshl_b64 s[8:9], s[4:5], 5
+; GFX90A-NEXT:    s_or_b32 s8, s8, 28
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_readfirstlane_b32 s7, v18
 ; GFX90A-NEXT:    s_and_b32 s7, 0xffff, s7
 ; GFX90A-NEXT:    s_mul_i32 s1, s1, s7
-; GFX90A-NEXT:    s_mul_hi_u32 s9, s0, s7
+; GFX90A-NEXT:    s_mul_hi_u32 s12, s0, s7
 ; GFX90A-NEXT:    s_mul_i32 s0, s0, s7
-; GFX90A-NEXT:    s_add_i32 s1, s9, s1
-; GFX90A-NEXT:    s_lshl_b64 s[14:15], s[0:1], 5
+; GFX90A-NEXT:    s_add_i32 s1, s12, s1
+; GFX90A-NEXT:    s_lshl_b64 s[12:13], s[0:1], 5
 ; GFX90A-NEXT:    s_branch .LBB3_2
 ; GFX90A-NEXT:  .LBB3_1: ; %Flow20
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
@@ -728,60 +725,59 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:  .LBB3_2: ; %bb9
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT:    ; Child Loop BB3_5 Depth 2
-; GFX90A-NEXT:    s_mov_b64 s[16:17], -1
+; GFX90A-NEXT:    s_mov_b64 s[14:15], -1
 ; GFX90A-NEXT:    s_cbranch_scc0 .LBB3_10
 ; GFX90A-NEXT:  ; %bb.3: ; %bb14
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
-; GFX90A-NEXT:    s_mov_b32 s7, s6
-; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v8
-; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
-; GFX90A-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
+; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], 0, 0
+; GFX90A-NEXT:    s_mov_b64 s[16:17], s[8:9]
+; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], 0, 0
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], 0, 0
+; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v12
+; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], 0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_readfirstlane_b32 s7, v4
-; GFX90A-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX90A-NEXT:    v_readfirstlane_b32 s18, v5
 ; GFX90A-NEXT:    s_add_u32 s7, s7, 1
-; GFX90A-NEXT:    s_addc_u32 s9, s9, 0
-; GFX90A-NEXT:    s_mul_hi_u32 s20, s2, s7
-; GFX90A-NEXT:    s_mul_i32 s9, s2, s9
-; GFX90A-NEXT:    s_mul_i32 s21, s3, s7
-; GFX90A-NEXT:    s_add_i32 s9, s20, s9
+; GFX90A-NEXT:    s_addc_u32 s18, s18, 0
+; GFX90A-NEXT:    s_mul_hi_u32 s19, s2, s7
+; GFX90A-NEXT:    s_mul_i32 s18, s2, s18
+; GFX90A-NEXT:    s_mul_i32 s20, s3, s7
+; GFX90A-NEXT:    s_add_i32 s18, s19, s18
 ; GFX90A-NEXT:    s_mul_i32 s7, s2, s7
-; GFX90A-NEXT:    s_add_i32 s9, s9, s21
+; GFX90A-NEXT:    s_add_i32 s22, s18, s20
 ; GFX90A-NEXT:    s_branch .LBB3_5
 ; GFX90A-NEXT:  .LBB3_4: ; %bb58
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
 ; GFX90A-NEXT:    v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    s_add_u32 s18, s18, s14
-; GFX90A-NEXT:    s_addc_u32 s19, s19, s15
-; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[22:23], -1, v[4:5]
-; GFX90A-NEXT:    s_mov_b64 s[20:21], 0
-; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[22:23]
+; GFX90A-NEXT:    s_add_u32 s16, s16, s12
+; GFX90A-NEXT:    s_addc_u32 s17, s17, s13
+; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[20:21], -1, v[4:5]
+; GFX90A-NEXT:    s_mov_b64 s[18:19], 0
+; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_9
 ; GFX90A-NEXT:  .LBB3_5: ; %bb16
 ; GFX90A-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT:    s_add_u32 s20, s18, s7
-; GFX90A-NEXT:    s_addc_u32 s21, s19, s9
-; GFX90A-NEXT:    global_load_dword v21, v19, s[20:21] offset:-12 glc
+; GFX90A-NEXT:    s_add_u32 s18, s16, s7
+; GFX90A-NEXT:    s_addc_u32 s19, s17, s22
+; GFX90A-NEXT:    global_load_dword v21, v19, s[18:19] offset:-12 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v20, v19, s[20:21] offset:-8 glc
+; GFX90A-NEXT:    global_load_dword v20, v19, s[18:19] offset:-8 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v14, v19, s[20:21] offset:-4 glc
+; GFX90A-NEXT:    global_load_dword v14, v19, s[18:19] offset:-4 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v14, v19, s[20:21] glc
+; GFX90A-NEXT:    global_load_dword v14, v19, s[18:19] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ds_read_b64 v[14:15], v19
 ; GFX90A-NEXT:    ds_read_b64 v[16:17], v0
 ; GFX90A-NEXT:    s_and_b64 vcc, exec, s[0:1]
-; GFX90A-NEXT:    ; kill: killed $sgpr20 killed $sgpr21
+; GFX90A-NEXT:    ; kill: killed $sgpr18 killed $sgpr19
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_vccnz .LBB3_7
 ; GFX90A-NEXT:  ; %bb.6: ; %bb51
@@ -800,27 +796,27 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_pk_add_f32 v[12:13], v[12:13], v[14:15]
 ; GFX90A-NEXT:    s_branch .LBB3_4
 ; GFX90A-NEXT:  .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
-; GFX90A-NEXT:    s_mov_b64 s[20:21], s[16:17]
-; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
+; GFX90A-NEXT:    s_mov_b64 s[18:19], s[14:15]
+; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX90A-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    s_mov_b64 s[20:21], -1
+; GFX90A-NEXT:    s_mov_b64 s[18:19], -1
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX90A-NEXT:    ; implicit-def: $sgpr18_sgpr19
+; GFX90A-NEXT:    ; implicit-def: $sgpr16_sgpr17
 ; GFX90A-NEXT:  .LBB3_9: ; %loop.exit.guard
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    s_xor_b64 s[16:17], s[20:21], -1
+; GFX90A-NEXT:    s_xor_b64 s[14:15], s[18:19], -1
 ; GFX90A-NEXT:  .LBB3_10: ; %Flow19
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 s[0:1], -1
-; GFX90A-NEXT:    s_and_b64 vcc, exec, s[16:17]
+; GFX90A-NEXT:    s_and_b64 vcc, exec, s[14:15]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX90A-NEXT:  ; %bb.11: ; %bb12
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    s_add_u32 s4, s4, s8
+; GFX90A-NEXT:    s_add_u32 s4, s4, s6
 ; GFX90A-NEXT:    s_addc_u32 s5, s5, 0
-; GFX90A-NEXT:    s_add_u32 s10, s10, s12
-; GFX90A-NEXT:    s_addc_u32 s11, s11, s13
+; GFX90A-NEXT:    s_add_u32 s8, s8, s10
+; GFX90A-NEXT:    s_addc_u32 s9, s9, s11
 ; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX90A-NEXT:    s_branch .LBB3_1
 ; GFX90A-NEXT:  .LBB3_12: ; %DummyReturnBlock
diff --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
index e03c9ca34b825..13f5b6598f6fa 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
@@ -11,7 +11,7 @@ define amdgpu_ps void @main(i32 %arg) {
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_mov_b32 s1, exec_lo
-; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX10-NEXT:    s_mov_b32 s2, 0
 ; GFX10-NEXT:    s_branch .LBB0_2
 ; GFX10-NEXT:  .LBB0_1: ; in Loop: Header=BB0_2 Depth=1
@@ -31,9 +31,7 @@ define amdgpu_ps void @main(i32 %arg) {
 ; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execz .LBB0_1
 ; GFX10-NEXT:  ; %bb.4: ; in Loop: Header=BB0_2 Depth=1
-; GFX10-NEXT:    s_mov_b32 s5, s4
-; GFX10-NEXT:    s_mov_b32 s6, s4
-; GFX10-NEXT:    s_mov_b32 s7, s4
+; GFX10-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; GFX10-NEXT:    buffer_atomic_and v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_branch .LBB0_1
 ; GFX10-NEXT:  .LBB0_5: ; %bb8
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
index de2e25651271a..e0f9855ef2741 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
@@ -16,7 +16,7 @@ define amdgpu_kernel void @materialize_0_i32(ptr addrspace(1) %out) {
 
 ; GCN-LABEL: {{^}}materialize_0_i64:
 ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
-; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
+; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v[[[LOK]]:[[HIK]]]
 define amdgpu_kernel void @materialize_0_i64(ptr addrspace(1) %out) {
   store i64 0, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index a6af63b816573..ab2f424c99254 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -9,44 +9,36 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    s_addc_u32 s13, s13, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; CHECK-NEXT:    s_load_dwordx8 s[48:55], s[8:9], 0x0
+; CHECK-NEXT:    s_load_dwordx8 s[20:27], s[8:9], 0x0
 ; CHECK-NEXT:    s_add_u32 s0, s0, s17
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b32 s12, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_cmp_lg_u32 s52, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s24, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_9
 ; CHECK-NEXT:  ; %bb.1: ; %if.end13.i.i
-; CHECK-NEXT:    s_cmp_eq_u32 s54, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB0_4
+; CHECK-NEXT:    s_cmp_eq_u32 s26, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_7
 ; CHECK-NEXT:  ; %bb.2: ; %if.else251.i.i
-; CHECK-NEXT:    s_cmp_lg_u32 s55, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s27, 0
 ; CHECK-NEXT:    s_mov_b32 s17, 0
 ; CHECK-NEXT:    s_cselect_b32 s12, -1, 0
 ; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_5
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_4
 ; CHECK-NEXT:  ; %bb.3:
 ; CHECK-NEXT:    s_mov_b32 s18, 0
-; CHECK-NEXT:    s_branch .LBB0_6
-; CHECK-NEXT:  .LBB0_4:
-; CHECK-NEXT:    s_mov_b32 s14, s12
-; CHECK-NEXT:    s_mov_b32 s15, s12
-; CHECK-NEXT:    s_mov_b32 s13, s12
-; CHECK-NEXT:    s_mov_b64 s[50:51], s[14:15]
-; CHECK-NEXT:    s_mov_b64 s[48:49], s[12:13]
-; CHECK-NEXT:    s_branch .LBB0_8
-; CHECK-NEXT:  .LBB0_5: ; %if.then263.i.i
-; CHECK-NEXT:    v_cmp_lt_f32_e64 s12, s53, 0
+; CHECK-NEXT:    s_branch .LBB0_5
+; CHECK-NEXT:  .LBB0_4: ; %if.then263.i.i
+; CHECK-NEXT:    v_cmp_lt_f32_e64 s12, s25, 0
 ; CHECK-NEXT:    s_mov_b32 s18, 1.0
 ; CHECK-NEXT:    s_mov_b32 s17, 0x7fc00000
-; CHECK-NEXT:  .LBB0_6: ; %Flow
-; CHECK-NEXT:    s_mov_b32 s48, 1.0
+; CHECK-NEXT:  .LBB0_5: ; %Flow
+; CHECK-NEXT:    s_mov_b32 s20, 1.0
 ; CHECK-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT:    s_mov_b32 s49, s48
-; CHECK-NEXT:    s_mov_b32 s50, s48
-; CHECK-NEXT:    s_mov_b32 s51, s48
+; CHECK-NEXT:    s_mov_b32 s21, s20
+; CHECK-NEXT:    s_mov_b32 s22, s20
+; CHECK-NEXT:    s_mov_b32 s23, s20
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_8
-; CHECK-NEXT:  ; %bb.7: ; %if.end273.i.i
+; CHECK-NEXT:  ; %bb.6: ; %if.end273.i.i
 ; CHECK-NEXT:    s_add_u32 s12, s8, 40
 ; CHECK-NEXT:    s_addc_u32 s13, s9, 0
 ; CHECK-NEXT:    s_getpc_b64 s[20:21]
@@ -65,13 +57,12 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    s_mov_b32 s13, s15
 ; CHECK-NEXT:    s_mov_b32 s14, s16
-; CHECK-NEXT:    s_mov_b32 s48, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s49, s48
-; CHECK-NEXT:    s_mov_b32 s50, s48
-; CHECK-NEXT:    s_mov_b32 s51, s48
+; CHECK-NEXT:  .LBB0_7: ; %if.end294.i.i
+; CHECK-NEXT:    s_mov_b64 s[20:21], 0
+; CHECK-NEXT:    s_mov_b64 s[22:23], s[20:21]
 ; CHECK-NEXT:  .LBB0_8: ; %if.end294.i.i
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
@@ -80,11 +71,11 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; CHECK-NEXT:  .LBB0_9: ; %kernel_direct_lighting.exit
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x20
-; CHECK-NEXT:    v_mov_b32_e32 v0, s48
+; CHECK-NEXT:    v_mov_b32_e32 v0, s20
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s49
-; CHECK-NEXT:    v_mov_b32_e32 v2, s50
-; CHECK-NEXT:    v_mov_b32_e32 v3, s51
+; CHECK-NEXT:    v_mov_b32_e32 v1, s21
+; CHECK-NEXT:    v_mov_b32_e32 v2, s22
+; CHECK-NEXT:    v_mov_b32_e32 v3, s23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; CHECK-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
index ea93e3ac1e595..ba58d82ef8e60 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
@@ -4,15 +4,14 @@
 define amdgpu_cs <2 x i32> @f() {
 ; CHECK-LABEL: f:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; CHECK-NEXT:    s_mov_b32 s1, 0
-; CHECK-NEXT:    s_mov_b32 s5, s4
-; CHECK-NEXT:    s_mov_b32 s6, s4
-; CHECK-NEXT:    s_mov_b32 s7, s4
 ; CHECK-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; CHECK-NEXT:    v_mov_b32_e32 v1, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
 ; CHECK-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index 986dd8a046424..ddb81ea7af764 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -446,19 +446,20 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v4, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v6, 1.0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_mov_b32 v6, 1.0 :: v_dual_add_f32 v11, 2.0, v5
-; GFX11-NEXT:    v_dual_add_f32 v9, 1.0, v5 :: v_dual_add_f32 v8, 1.0, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_add_f32 v10, 2.0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_add_f32 v11, 2.0, v4
+; GFX11-NEXT:    v_dual_add_f32 v8, 1.0, v4 :: v_dual_add_f32 v9, 1.0, v5
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_add_f32 v12, 2.0, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    image_sample_d v[2:5], [v8, v9, v2, v2, v[2:3]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT:    image_sample_d v[6:9], [v10, v11, v6, v6, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    image_sample_d v[2:5], [v8, v9, v10, v10, v[2:3]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    image_sample_d v[6:9], [v11, v12, v6, v6, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v7
 ; GFX11-NEXT:    v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
-; GFX11-NEXT:    v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v6
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
 ; GFX11-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index a47ecb2c5d7f2..f2ee579e0d128 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -982,60 +982,51 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-NEXT:    s_movk_i32 s4, 0x207
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_mov_b64 s[6:7], 0
-; GCN-NEXT:    s_branch .LBB5_3
-; GCN-NEXT:  .LBB5_1: ; %Flow
-; GCN-NEXT:    ; in Loop: Header=BB5_3 Depth=1
-; GCN-NEXT:    s_or_b64 exec, exec, s[12:13]
-; GCN-NEXT:  .LBB5_2: ; %bb10
-; GCN-NEXT:    ; in Loop: Header=BB5_3 Depth=1
+; GCN-NEXT:    s_branch .LBB5_2
+; GCN-NEXT:  .LBB5_1: ; %bb10
+; GCN-NEXT:    ; in Loop: Header=BB5_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-NEXT:    s_and_b64 s[6:7], exec, s[4:5]
 ; GCN-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GCN-NEXT:    s_cbranch_execz .LBB5_7
-; GCN-NEXT:  .LBB5_3: ; %bb1
+; GCN-NEXT:    s_cbranch_execz .LBB5_5
+; GCN-NEXT:  .LBB5_2: ; %bb1
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    s_and_b64 s[10:11], exec, vcc
 ; GCN-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GCN-NEXT:    s_cbranch_execnz .LBB5_3
-; GCN-NEXT:  ; %bb.4: ; %bb2
-; GCN-NEXT:    ; in Loop: Header=BB5_3 Depth=1
+; GCN-NEXT:    s_cbranch_execnz .LBB5_2
+; GCN-NEXT:  ; %bb.3: ; %bb2
+; GCN-NEXT:    ; in Loop: Header=BB5_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-NEXT:    v_mov_b32_e32 v8, v7
-; GCN-NEXT:    v_mov_b32_e32 v2, v7
-; GCN-NEXT:    v_mov_b32_e32 v6, v7
+; GCN-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    s_and_saveexec_b64 s[10:11], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB5_2
-; GCN-NEXT:  ; %bb.5: ; %bb4
-; GCN-NEXT:    ; in Loop: Header=BB5_3 Depth=1
-; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v8, v7
-; GCN-NEXT:    v_mov_b32_e32 v2, v7
-; GCN-NEXT:    v_mov_b32_e32 v6, v7
+; GCN-NEXT:    s_cbranch_execz .LBB5_1
+; GCN-NEXT:  ; %bb.4: ; %bb4
+; GCN-NEXT:    ; in Loop: Header=BB5_2 Depth=1
+; GCN-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_gt_f32_e64 s[6:7], 0, v0
+; GCN-NEXT:    v_cmp_gt_f32_e64 s[6:7], 0, v2
+; GCN-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    s_and_saveexec_b64 s[12:13], s[6:7]
-; GCN-NEXT:    s_cbranch_execz .LBB5_1
-; GCN-NEXT:  ; %bb.6: ; %bb8
-; GCN-NEXT:    ; in Loop: Header=BB5_3 Depth=1
-; GCN-NEXT:    v_mov_b32_e32 v8, v7
-; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GCN-NEXT:    ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6
+; GCN-NEXT:    s_or_b64 exec, exec, s[12:13]
 ; GCN-NEXT:    s_branch .LBB5_1
-; GCN-NEXT:  .LBB5_7: ; %bb12
+; GCN-NEXT:  .LBB5_5: ; %bb12
 ; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1043,135 +1034,126 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0:       ; %bb.0: ; %bb
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GCN-O0-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GCN-O0-NEXT:    ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
 ; GCN-O0-NEXT:    s_waitcnt expcnt(1)
-; GCN-O0-NEXT:    v_writelane_b32 v7, s6, 0
-; GCN-O0-NEXT:    v_writelane_b32 v7, s7, 1
-; GCN-O0-NEXT:    v_writelane_b32 v7, s4, 2
-; GCN-O0-NEXT:    v_writelane_b32 v7, s5, 3
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 0
+; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 1
+; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 2
+; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 3
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:  .LBB5_1: ; %bb1
 ; GCN-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s8, v7, 2
-; GCN-O0-NEXT:    v_readlane_b32 s9, v7, 3
-; GCN-O0-NEXT:    v_readlane_b32 s6, v7, 0
-; GCN-O0-NEXT:    v_readlane_b32 s7, v7, 1
-; GCN-O0-NEXT:    v_writelane_b32 v7, s6, 4
-; GCN-O0-NEXT:    v_writelane_b32 v7, s7, 5
+; GCN-O0-NEXT:    v_readlane_b32 s8, v6, 2
+; GCN-O0-NEXT:    v_readlane_b32 s9, v6, 3
+; GCN-O0-NEXT:    v_readlane_b32 s6, v6, 0
+; GCN-O0-NEXT:    v_readlane_b32 s7, v6, 1
+; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 4
+; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 5
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0x207
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_cmp_lt_i32_e64 s[4:5], v0, s4
 ; GCN-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GCN-O0-NEXT:    v_writelane_b32 v7, s4, 6
-; GCN-O0-NEXT:    v_writelane_b32 v7, s5, 7
-; GCN-O0-NEXT:    v_writelane_b32 v7, s6, 0
-; GCN-O0-NEXT:    v_writelane_b32 v7, s7, 1
+; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 6
+; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 7
+; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 0
+; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 1
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GCN-O0-NEXT:    v_writelane_b32 v7, s6, 2
-; GCN-O0-NEXT:    v_writelane_b32 v7, s7, 3
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 2
+; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 3
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN-O0-NEXT:    s_cbranch_execnz .LBB5_1
 ; GCN-O0-NEXT:  ; %bb.2: ; %bb2
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s4, v7, 6
-; GCN-O0-NEXT:    v_readlane_b32 s5, v7, 7
+; GCN-O0-NEXT:    v_readlane_b32 s4, v6, 6
+; GCN-O0-NEXT:    v_readlane_b32 s5, v6, 7
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s6, 0
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[4:5], v0, s6
 ; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
-; GCN-O0-NEXT:    v_writelane_b32 v7, s4, 8
-; GCN-O0-NEXT:    v_writelane_b32 v7, s5, 9
-; GCN-O0-NEXT:    s_mov_b32 s4, 0
+; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 8
+; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 9
+; GCN-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-O0-NEXT:    v_mov_b32_e32 v6, s4
-; GCN-O0-NEXT:    v_mov_b32_e32 v5, s4
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, v6
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v5
-; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GCN-O0-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v5
 ; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-O0-NEXT:    v_writelane_b32 v7, s4, 10
-; GCN-O0-NEXT:    v_writelane_b32 v7, s5, 11
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 10
+; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 11
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB5_5
 ; GCN-O0-NEXT:  ; %bb.3: ; %bb4
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    ; implicit-def: $sgpr4
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-O0-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_cmp_lt_f32_e64 s[6:7], v0, s4
+; GCN-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-O0-NEXT:    v_mov_b32_e32 v6, s4
-; GCN-O0-NEXT:    v_mov_b32_e32 v5, s4
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
-; GCN-O0-NEXT:    v_mov_b32_e32 v1, v6
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, v5
-; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GCN-O0-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v5
 ; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-O0-NEXT:    v_writelane_b32 v7, s4, 12
-; GCN-O0-NEXT:    v_writelane_b32 v7, s5, 13
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 12
+; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 13
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB5_6
 ; GCN-O0-NEXT:  ; %bb.4: ; %bb8
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_mov_b32 s10, 0
-; GCN-O0-NEXT:    ; implicit-def: $sgpr4
-; GCN-O0-NEXT:    ; implicit-def: $sgpr5
-; GCN-O0-NEXT:    ; implicit-def: $sgpr9
-; GCN-O0-NEXT:    ; implicit-def: $sgpr5
-; GCN-O0-NEXT:    ; implicit-def: $sgpr8
-; GCN-O0-NEXT:    ; implicit-def: $sgpr5
-; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
-; GCN-O0-NEXT:    s_mov_b32 s5, s10
-; GCN-O0-NEXT:    s_mov_b32 s6, s9
-; GCN-O0-NEXT:    s_mov_b32 s7, s8
+; GCN-O0-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-O0-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[8:9]
 ; GCN-O0-NEXT:    s_waitcnt expcnt(1)
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, s5
@@ -1184,13 +1166,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_branch .LBB5_6
 ; GCN-O0-NEXT:  .LBB5_5: ; %Flow2
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s4, v7, 10
-; GCN-O0-NEXT:    v_readlane_b32 s5, v7, 11
+; GCN-O0-NEXT:    v_readlane_b32 s4, v6, 10
+; GCN-O0-NEXT:    v_readlane_b32 s5, v6, 11
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -1207,13 +1189,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_branch .LBB5_7
 ; GCN-O0-NEXT:  .LBB5_6: ; %Flow
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s4, v7, 12
-; GCN-O0-NEXT:    v_readlane_b32 s5, v7, 13
+; GCN-O0-NEXT:    v_readlane_b32 s4, v6, 12
+; GCN-O0-NEXT:    v_readlane_b32 s5, v6, 13
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
@@ -1230,52 +1212,52 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_branch .LBB5_5
 ; GCN-O0-NEXT:  .LBB5_7: ; %bb10
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s6, v7, 8
-; GCN-O0-NEXT:    v_readlane_b32 s7, v7, 9
+; GCN-O0-NEXT:    v_readlane_b32 s6, v6, 8
+; GCN-O0-NEXT:    v_readlane_b32 s7, v6, 9
 ; GCN-O0-NEXT:    s_mov_b64 s[4:5], -1
-; GCN-O0-NEXT:    v_writelane_b32 v7, s4, 14
-; GCN-O0-NEXT:    v_writelane_b32 v7, s5, 15
+; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 14
+; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 15
 ; GCN-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-O0-NEXT:    v_writelane_b32 v7, s4, 16
-; GCN-O0-NEXT:    v_writelane_b32 v7, s5, 17
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 16
+; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 17
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB5_9
 ; GCN-O0-NEXT:  ; %bb.8: ; %Flow1
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-O0-NEXT:    s_xor_b64 s[4:5], exec, -1
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_writelane_b32 v7, s4, 14
-; GCN-O0-NEXT:    v_writelane_b32 v7, s5, 15
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 14
+; GCN-O0-NEXT:    v_writelane_b32 v6, s5, 15
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:  .LBB5_9: ; %Flow3
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s8, v7, 16
-; GCN-O0-NEXT:    v_readlane_b32 s9, v7, 17
+; GCN-O0-NEXT:    v_readlane_b32 s8, v6, 16
+; GCN-O0-NEXT:    v_readlane_b32 s9, v6, 17
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-O0-NEXT:    v_readlane_b32 s6, v7, 4
-; GCN-O0-NEXT:    v_readlane_b32 s7, v7, 5
-; GCN-O0-NEXT:    v_readlane_b32 s4, v7, 14
-; GCN-O0-NEXT:    v_readlane_b32 s5, v7, 15
+; GCN-O0-NEXT:    v_readlane_b32 s6, v6, 4
+; GCN-O0-NEXT:    v_readlane_b32 s7, v6, 5
+; GCN-O0-NEXT:    v_readlane_b32 s4, v6, 14
+; GCN-O0-NEXT:    v_readlane_b32 s5, v6, 15
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
@@ -1284,16 +1266,16 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-O0-NEXT:    s_mov_b64 s[8:9], s[4:5]
-; GCN-O0-NEXT:    v_writelane_b32 v7, s8, 0
-; GCN-O0-NEXT:    v_writelane_b32 v7, s9, 1
-; GCN-O0-NEXT:    v_writelane_b32 v7, s6, 2
-; GCN-O0-NEXT:    v_writelane_b32 v7, s7, 3
+; GCN-O0-NEXT:    v_writelane_b32 v6, s8, 0
+; GCN-O0-NEXT:    v_writelane_b32 v6, s9, 1
+; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 2
+; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 3
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GCN-O0-NEXT:    v_writelane_b32 v7, s6, 18
-; GCN-O0-NEXT:    v_writelane_b32 v7, s7, 19
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
-; GCN-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 18
+; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 19
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; GCN-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(4)
@@ -1305,13 +1287,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN-O0-NEXT:    s_cbranch_execnz .LBB5_1
 ; GCN-O0-NEXT:  ; %bb.10: ; %bb12
-; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(4)
-; GCN-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_mov_b64 exec, s[12:13]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
-; GCN-O0-NEXT:    v_readlane_b32 s4, v7, 18
-; GCN-O0-NEXT:    v_readlane_b32 s5, v7, 19
+; GCN-O0-NEXT:    v_readlane_b32 s4, v6, 18
+; GCN-O0-NEXT:    v_readlane_b32 s5, v6, 19
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-O0-NEXT:  ; %bb.11: ; %bb12
 ; GCN-O0-NEXT:    s_waitcnt expcnt(3)
@@ -1344,7 +1326,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-O0-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index e72f3d3ce993a..e0f7fe0d0bc31 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -2840,7 +2840,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
-; VI-NEXT:    v_mov_b32_e32 v1, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -2851,9 +2851,10 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; CI-LABEL: s_test_canonicalize_undef_v4f16:
@@ -2861,8 +2862,8 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    v_mov_b32_e32 v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
@@ -2871,10 +2872,9 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
   store <4 x half> %canonicalized, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 4e12a30c6f6f4..996c485fe9a8d 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -1363,7 +1363,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, v0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
@@ -1374,29 +1374,28 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fold_canonicalize_p0_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_p0_f64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double 0.0)
   store double %canonicalized, ptr addrspace(1) %out
@@ -1574,7 +1573,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, v0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
@@ -1585,29 +1584,28 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
   store double %canonicalized, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
index 1e469b1951009..e475a713e243a 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
@@ -5,7 +5,6 @@ define amdgpu_gs void @f(i32 inreg %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-LABEL: f:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
-; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %bb3
 ; CHECK-NEXT:    v_mov_b32_e32 v5, v0
@@ -15,10 +14,9 @@ define amdgpu_gs void @f(i32 inreg %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-NEXT:    v_mov_b32_e32 v5, 1
 ; CHECK-NEXT:  .LBB0_3: ; %bb4
 ; CHECK-NEXT:    v_mov_b32_e32 v6, 0
-; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    s_mov_b32 s2, s0
-; CHECK-NEXT:    s_mov_b32 s3, s0
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    s_mov_b64 s[0:1], 0
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v7, v6
 ; CHECK-NEXT:    v_mov_b32_e32 v8, v6
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v6
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index e7c8604776ce0..750026fb23dae 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -17,16 +17,15 @@
 define amdgpu_kernel void @zero_init_kernel() {
 ; GFX9-LABEL: zero_init_kernel:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
-; GFX9-NEXT:    s_mov_b32 s1, s0
-; GFX9-NEXT:    s_mov_b32 s2, s0
-; GFX9-NEXT:    s_mov_b32 s3, s0
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    s_mov_b32 s0, 0
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
@@ -39,10 +38,8 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX10-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
-; GFX10-NEXT:    s_mov_b32 s0, 0
-; GFX10-NEXT:    s_mov_b32 s1, s0
-; GFX10-NEXT:    s_mov_b32 s2, s0
-; GFX10-NEXT:    s_mov_b32 s3, s0
+; GFX10-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
@@ -55,11 +52,9 @@ define amdgpu_kernel void @zero_init_kernel() {
 ;
 ; GFX11-LABEL: zero_init_kernel:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
-; GFX11-NEXT:    s_mov_b32 s3, s0
+; GFX11-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_clause 0x3
@@ -71,11 +66,9 @@ define amdgpu_kernel void @zero_init_kernel() {
 ;
 ; GFX12-LABEL: zero_init_kernel:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    s_clause 0x3
@@ -90,18 +83,17 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
 ; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
-; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX9-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
@@ -110,10 +102,8 @@ define amdgpu_kernel void @zero_init_kernel() {
 ;
 ; GFX942-LABEL: zero_init_kernel:
 ; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
@@ -133,14 +123,13 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX1010-PAL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1010-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
@@ -158,10 +147,8 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX1030-PAL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1030-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
@@ -174,11 +161,9 @@ define amdgpu_kernel void @zero_init_kernel() {
 ;
 ; GFX11-PAL-LABEL: zero_init_kernel:
 ; GFX11-PAL:       ; %bb.0:
-; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX11-PAL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX11-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-PAL-NEXT:    s_clause 0x3
@@ -190,11 +175,9 @@ define amdgpu_kernel void @zero_init_kernel() {
 ;
 ; GFX12-PAL-LABEL: zero_init_kernel:
 ; GFX12-PAL:       ; %bb.0:
-; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX12-PAL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX12-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-PAL-NEXT:    s_clause 0x3
@@ -212,10 +195,8 @@ define void @zero_init_foo() {
 ; GFX9-LABEL: zero_init_foo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, 0
-; GFX9-NEXT:    s_mov_b32 s1, s0
-; GFX9-NEXT:    s_mov_b32 s2, s0
-; GFX9-NEXT:    s_mov_b32 s3, s0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -230,10 +211,8 @@ define void @zero_init_foo() {
 ; GFX10-LABEL: zero_init_foo:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, 0
-; GFX10-NEXT:    s_mov_b32 s1, s0
-; GFX10-NEXT:    s_mov_b32 s2, s0
-; GFX10-NEXT:    s_mov_b32 s3, s0
+; GFX10-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
@@ -247,11 +226,9 @@ define void @zero_init_foo() {
 ; GFX11-LABEL: zero_init_foo:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
-; GFX11-NEXT:    s_mov_b32 s3, s0
+; GFX11-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_clause 0x3
@@ -268,13 +245,11 @@ define void @zero_init_foo() {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    s_clause 0x3
 ; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
@@ -286,10 +261,8 @@ define void @zero_init_foo() {
 ; GFX9-PAL-LABEL: zero_init_foo:
 ; GFX9-PAL:       ; %bb.0:
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX9-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
@@ -304,10 +277,8 @@ define void @zero_init_foo() {
 ; GFX942-LABEL: zero_init_foo:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
@@ -320,10 +291,8 @@ define void @zero_init_foo() {
 ; GFX10-PAL-LABEL: zero_init_foo:
 ; GFX10-PAL:       ; %bb.0:
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX10-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
@@ -337,11 +306,9 @@ define void @zero_init_foo() {
 ; GFX11-PAL-LABEL: zero_init_foo:
 ; GFX11-PAL:       ; %bb.0:
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX11-PAL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX11-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-PAL-NEXT:    s_clause 0x3
@@ -358,13 +325,11 @@ define void @zero_init_foo() {
 ; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
-; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX12-PAL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
+; GFX12-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-PAL-NEXT:    s_clause 0x3
 ; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
@@ -1043,13 +1008,13 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX9-NEXT:    s_mov_b32 s0, 0
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s1, s0
-; GFX9-NEXT:    s_mov_b32 s2, s0
-; GFX9-NEXT:    s_mov_b32 s3, s0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    s_mov_b32 s0, 0
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:256
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:272
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:288
@@ -1064,10 +1029,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
 ; GFX10-NEXT:    scratch_load_dword v0, off, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, 0
-; GFX10-NEXT:    s_mov_b32 s1, s0
-; GFX10-NEXT:    s_mov_b32 s2, s0
-; GFX10-NEXT:    s_mov_b32 s3, s0
+; GFX10-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
@@ -1082,11 +1045,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    scratch_load_b32 v0, off, off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
-; GFX11-NEXT:    s_mov_b32 s3, s0
+; GFX11-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_clause 0x3
@@ -1100,11 +1061,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    s_clause 0x3
@@ -1120,19 +1079,19 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
 ; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:256
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:272
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:288
@@ -1143,10 +1102,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    scratch_load_dword v0, off, off sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:256
@@ -1169,13 +1126,14 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1010-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:256
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:272
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:288
@@ -1195,10 +1153,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1030-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
@@ -1213,11 +1169,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX11-PAL:       ; %bb.0:
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX11-PAL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX11-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-PAL-NEXT:    s_clause 0x3
@@ -1231,11 +1185,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX12-PAL:       ; %bb.0:
 ; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX12-PAL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX12-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-PAL-NEXT:    s_clause 0x3
@@ -1258,10 +1210,8 @@ define void @zero_init_small_offset_foo() {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, 0
-; GFX9-NEXT:    s_mov_b32 s1, s0
-; GFX9-NEXT:    s_mov_b32 s2, s0
-; GFX9-NEXT:    s_mov_b32 s3, s0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -1278,10 +1228,8 @@ define void @zero_init_small_offset_foo() {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, 0
-; GFX10-NEXT:    s_mov_b32 s1, s0
-; GFX10-NEXT:    s_mov_b32 s2, s0
-; GFX10-NEXT:    s_mov_b32 s3, s0
+; GFX10-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
@@ -1297,11 +1245,9 @@ define void @zero_init_small_offset_foo() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
-; GFX11-NEXT:    s_mov_b32 s3, s0
+; GFX11-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_clause 0x3
@@ -1320,13 +1266,11 @@ define void @zero_init_small_offset_foo() {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    s_clause 0x3
 ; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
@@ -1340,10 +1284,8 @@ define void @zero_init_small_offset_foo() {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX9-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
@@ -1360,10 +1302,8 @@ define void @zero_init_small_offset_foo() {
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
@@ -1378,10 +1318,8 @@ define void @zero_init_small_offset_foo() {
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX10-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
@@ -1397,11 +1335,9 @@ define void @zero_init_small_offset_foo() {
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX11-PAL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX11-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-PAL-NEXT:    s_clause 0x3
@@ -1420,13 +1356,11 @@ define void @zero_init_small_offset_foo() {
 ; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
-; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX12-PAL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
+; GFX12-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-PAL-NEXT:    s_clause 0x3
 ; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
@@ -2237,9 +2171,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX9-NEXT:    s_mov_b32 s0, 0
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s1, s0
-; GFX9-NEXT:    s_mov_b32 s2, s0
-; GFX9-NEXT:    s_mov_b32 s3, s0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -2259,10 +2192,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
 ; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, 0
-; GFX10-NEXT:    s_mov_b32 s1, s0
-; GFX10-NEXT:    s_mov_b32 s2, s0
-; GFX10-NEXT:    s_mov_b32 s3, s0
+; GFX10-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
@@ -2278,11 +2209,9 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
-; GFX11-NEXT:    s_mov_b32 s3, s0
+; GFX11-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_movk_i32 s0, 0x4004
@@ -2297,11 +2226,9 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    s_clause 0x3
@@ -2317,15 +2244,14 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
 ; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
@@ -2341,10 +2267,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    s_movk_i32 s0, 0x4004
@@ -2368,9 +2292,9 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1010-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
@@ -2395,10 +2319,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1030-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
@@ -2414,11 +2336,9 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX11-PAL:       ; %bb.0:
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX11-PAL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX11-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-PAL-NEXT:    s_movk_i32 s0, 0x4004
@@ -2433,11 +2353,9 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX12-PAL:       ; %bb.0:
 ; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX12-PAL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX12-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-PAL-NEXT:    s_clause 0x3
@@ -2460,10 +2378,8 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, 0
-; GFX9-NEXT:    s_mov_b32 s1, s0
-; GFX9-NEXT:    s_mov_b32 s2, s0
-; GFX9-NEXT:    s_mov_b32 s3, s0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -2484,10 +2400,8 @@ define void @zero_init_large_offset_foo() {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, 0
-; GFX10-NEXT:    s_mov_b32 s1, s0
-; GFX10-NEXT:    s_mov_b32 s2, s0
-; GFX10-NEXT:    s_mov_b32 s3, s0
+; GFX10-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
@@ -2507,11 +2421,9 @@ define void @zero_init_large_offset_foo() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
-; GFX11-NEXT:    s_mov_b32 s3, s0
+; GFX11-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
@@ -2533,13 +2445,11 @@ define void @zero_init_large_offset_foo() {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    s_clause 0x3
 ; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16384
@@ -2553,10 +2463,8 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX9-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
@@ -2577,10 +2485,8 @@ define void @zero_init_large_offset_foo() {
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 offset:4 sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    s_add_i32 s0, s32, 0x4004
@@ -2599,10 +2505,8 @@ define void @zero_init_large_offset_foo() {
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1010-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
@@ -2625,10 +2529,8 @@ define void @zero_init_large_offset_foo() {
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1030-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
@@ -2648,11 +2550,9 @@ define void @zero_init_large_offset_foo() {
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX11-PAL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX11-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
@@ -2674,13 +2574,11 @@ define void @zero_init_large_offset_foo() {
 ; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
-; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX12-PAL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
+; GFX12-PAL-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-PAL-NEXT:    s_clause 0x3
 ; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16384
@@ -4770,12 +4668,12 @@ bb:
 define amdgpu_ps void @large_offset() {
 ; GFX9-LABEL: large_offset:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-NEXT:    s_mov_b32 s0, 0
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:3024
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -4798,11 +4696,11 @@ define amdgpu_ps void @large_offset() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_movk_i32 s0, 0x810
 ; GFX10-NEXT:    s_add_i32 s1, s0, 0x3c0
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], off, s1 glc dlc
@@ -4819,12 +4717,11 @@ define amdgpu_ps void @large_offset() {
 ; GFX11-LABEL: large_offset:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_mov_b32 s0, 16
 ; GFX11-NEXT:    s_movk_i32 s1, 0x810
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11-NEXT:    v_mov_b32_e32 v2, v0
-; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
@@ -4840,11 +4737,11 @@ define amdgpu_ps void @large_offset() {
 ; GFX12-LABEL: large_offset:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    s_mov_b32 s0, 0
 ; GFX12-NEXT:    s_movk_i32 s1, 0x800
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS
@@ -4863,9 +4760,9 @@ define amdgpu_ps void @large_offset() {
 ; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
@@ -4887,10 +4784,8 @@ define amdgpu_ps void @large_offset() {
 ;
 ; GFX942-LABEL: large_offset:
 ; GFX942:       ; %bb.0: ; %bb
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
@@ -4917,11 +4812,11 @@ define amdgpu_ps void @large_offset() {
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x810
 ; GFX1010-PAL-NEXT:    s_add_i32 s1, s0, 0x3c0
-; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s1 glc dlc
@@ -4948,11 +4843,11 @@ define amdgpu_ps void @large_offset() {
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x810
 ; GFX1030-PAL-NEXT:    s_add_i32 s1, s0, 0x3c0
-; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s1 glc dlc
@@ -4969,12 +4864,11 @@ define amdgpu_ps void @large_offset() {
 ; GFX11-PAL-LABEL: large_offset:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 16
 ; GFX11-PAL-NEXT:    s_movk_i32 s1, 0x810
 ; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, v0
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
@@ -4990,11 +4884,11 @@ define amdgpu_ps void @large_offset() {
 ; GFX12-PAL-LABEL: large_offset:
 ; GFX12-PAL:       ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX12-PAL-NEXT:    s_movk_i32 s1, 0x800
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
-; GFX12-PAL-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-PAL-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 6384fdba7a45a..232e394c5fc2d 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -1982,11 +1982,9 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX11-LABEL: return_512xi32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mov_b32 s3, s0
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    s_clause 0x1f
@@ -3186,11 +3184,9 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v60, s33 offset:1600 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
-; GFX11-NEXT:    s_mov_b32 s3, s0
+; GFX11-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_mov_b32 s36, s34
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
index 4aa49f2c9296d..11f931aacbc96 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
+++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
@@ -5,9 +5,9 @@ define amdgpu_kernel void @foo() {
 ; CHECK-LABEL: foo:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_mov_b64 s[0:1], src_shared_base
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1
-; CHECK-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0
 ; CHECK-NEXT:    flat_store_b64 v[0:1], v[2:3]
 ; CHECK-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 8dbd6c5d133ea..8090a36c07b42 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -6,100 +6,127 @@ define void @main(i1 %arg) #0 {
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    v_writelane_b32 v5, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v5, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v5, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v5, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v5, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v5, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v5, s38, 6
-; CHECK-NEXT:    s_getpc_b64 s[24:25]
-; CHECK-NEXT:    v_writelane_b32 v5, s39, 7
-; CHECK-NEXT:    s_movk_i32 s20, 0xf0
-; CHECK-NEXT:    s_mov_b32 s21, s24
-; CHECK-NEXT:    v_writelane_b32 v5, s48, 8
-; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
-; CHECK-NEXT:    s_mov_b64 s[20:21], 0
-; CHECK-NEXT:    v_writelane_b32 v5, s49, 9
-; CHECK-NEXT:    s_load_dwordx4 s[20:23], s[20:21], 0x0
-; CHECK-NEXT:    v_writelane_b32 v5, s50, 10
+; CHECK-NEXT:    v_writelane_b32 v6, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v6, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v6, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v6, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v6, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v6, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v6, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v6, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v6, s48, 8
+; CHECK-NEXT:    v_writelane_b32 v6, s49, 9
+; CHECK-NEXT:    v_writelane_b32 v6, s50, 10
+; CHECK-NEXT:    s_getpc_b64 s[8:9]
+; CHECK-NEXT:    s_mov_b64 s[12:13], 0
+; CHECK-NEXT:    v_writelane_b32 v6, s51, 11
+; CHECK-NEXT:    s_movk_i32 s10, 0xf0
+; CHECK-NEXT:    s_mov_b32 s11, s8
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[12:13], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[10:11], 0x0
+; CHECK-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; CHECK-NEXT:    v_writelane_b32 v6, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v6, s53, 13
+; CHECK-NEXT:    v_writelane_b32 v6, s54, 14
+; CHECK-NEXT:    v_writelane_b32 v6, s55, 15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_movk_i32 s22, 0x130
-; CHECK-NEXT:    s_mov_b32 s23, s24
-; CHECK-NEXT:    v_writelane_b32 v5, s51, 11
-; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[22:23], 0x0
-; CHECK-NEXT:    s_mov_b32 s28, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    v_mov_b32_e32 v2, s20
-; CHECK-NEXT:    v_mov_b32_e32 v3, v1
-; CHECK-NEXT:    s_mov_b32 s29, s28
-; CHECK-NEXT:    s_mov_b32 s30, s28
-; CHECK-NEXT:    s_mov_b32 s31, s28
-; CHECK-NEXT:    image_sample_lz v3, v[2:3], s[12:19], s[28:31] dmask:0x1
-; CHECK-NEXT:    v_mov_b32_e32 v2, v1
-; CHECK-NEXT:    ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
-; CHECK-NEXT:    v_writelane_b32 v5, s52, 12
+; CHECK-NEXT:    v_writelane_b32 v7, s36, 0
+; CHECK-NEXT:    v_writelane_b32 v7, s37, 1
+; CHECK-NEXT:    v_writelane_b32 v7, s38, 2
+; CHECK-NEXT:    v_writelane_b32 v7, s39, 3
+; CHECK-NEXT:    v_writelane_b32 v7, s40, 4
+; CHECK-NEXT:    v_writelane_b32 v7, s41, 5
+; CHECK-NEXT:    v_writelane_b32 v7, s42, 6
+; CHECK-NEXT:    v_writelane_b32 v7, s43, 7
+; CHECK-NEXT:    v_writelane_b32 v6, s64, 16
+; CHECK-NEXT:    v_writelane_b32 v7, s44, 8
+; CHECK-NEXT:    v_writelane_b32 v6, s65, 17
+; CHECK-NEXT:    v_writelane_b32 v7, s45, 9
+; CHECK-NEXT:    v_writelane_b32 v6, s66, 18
+; CHECK-NEXT:    s_movk_i32 s6, 0x130
+; CHECK-NEXT:    s_mov_b32 s7, s8
+; CHECK-NEXT:    v_writelane_b32 v7, s46, 10
+; CHECK-NEXT:    v_writelane_b32 v6, s67, 19
+; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[6:7], 0x0
+; CHECK-NEXT:    v_writelane_b32 v7, s47, 11
+; CHECK-NEXT:    v_writelane_b32 v7, s48, 12
+; CHECK-NEXT:    v_writelane_b32 v7, s49, 13
+; CHECK-NEXT:    s_mov_b64 s[28:29], 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_writelane_b32 v7, s50, 14
+; CHECK-NEXT:    v_mov_b32_e32 v1, s4
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    s_mov_b64 s[30:31], s[28:29]
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    v_writelane_b32 v7, s51, 15
+; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[44:51], s[28:31] dmask:0x1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v6, s36, 0
-; CHECK-NEXT:    v_writelane_b32 v6, s37, 1
-; CHECK-NEXT:    v_writelane_b32 v6, s38, 2
-; CHECK-NEXT:    v_writelane_b32 v6, s39, 3
-; CHECK-NEXT:    v_writelane_b32 v6, s40, 4
-; CHECK-NEXT:    v_writelane_b32 v6, s41, 5
-; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[36:43], s[28:31] dmask:0x1
-; CHECK-NEXT:    v_writelane_b32 v6, s42, 6
-; CHECK-NEXT:    v_writelane_b32 v6, s43, 7
-; CHECK-NEXT:    v_writelane_b32 v6, s44, 8
-; CHECK-NEXT:    v_writelane_b32 v6, s45, 9
-; CHECK-NEXT:    v_writelane_b32 v5, s53, 13
-; CHECK-NEXT:    v_writelane_b32 v6, s46, 10
-; CHECK-NEXT:    v_writelane_b32 v5, s54, 14
-; CHECK-NEXT:    v_writelane_b32 v6, s47, 11
-; CHECK-NEXT:    v_writelane_b32 v5, s55, 15
-; CHECK-NEXT:    v_writelane_b32 v6, s48, 12
-; CHECK-NEXT:    v_writelane_b32 v5, s64, 16
-; CHECK-NEXT:    v_writelane_b32 v6, s49, 13
-; CHECK-NEXT:    v_writelane_b32 v5, s65, 17
-; CHECK-NEXT:    v_writelane_b32 v6, s50, 14
-; CHECK-NEXT:    v_writelane_b32 v5, s66, 18
-; CHECK-NEXT:    v_writelane_b32 v6, s51, 15
-; CHECK-NEXT:    s_mov_b32 s40, 48
-; CHECK-NEXT:    s_movk_i32 s56, 0x1f0
+; CHECK-NEXT:    v_writelane_b32 v7, s52, 16
+; CHECK-NEXT:    v_writelane_b32 v7, s53, 17
+; CHECK-NEXT:    v_writelane_b32 v7, s54, 18
+; CHECK-NEXT:    v_writelane_b32 v7, s55, 19
+; CHECK-NEXT:    v_writelane_b32 v7, s56, 20
+; CHECK-NEXT:    v_writelane_b32 v7, s57, 21
+; CHECK-NEXT:    image_sample_lz v5, v[3:4], s[52:59], s[28:31] dmask:0x1
+; CHECK-NEXT:    v_writelane_b32 v7, s58, 22
+; CHECK-NEXT:    v_writelane_b32 v7, s59, 23
+; CHECK-NEXT:    v_writelane_b32 v7, s60, 24
+; CHECK-NEXT:    v_writelane_b32 v7, s61, 25
+; CHECK-NEXT:    v_writelane_b32 v7, s62, 26
+; CHECK-NEXT:    v_writelane_b32 v7, s63, 27
+; CHECK-NEXT:    v_writelane_b32 v7, s64, 28
+; CHECK-NEXT:    v_writelane_b32 v7, s65, 29
+; CHECK-NEXT:    v_writelane_b32 v7, s66, 30
+; CHECK-NEXT:    s_mov_b32 s4, 48
+; CHECK-NEXT:    s_movk_i32 s40, 0x1f0
 ; CHECK-NEXT:    s_movk_i32 s34, 0x2f0
-; CHECK-NEXT:    s_mov_b32 s41, s24
-; CHECK-NEXT:    s_mov_b32 s57, s24
-; CHECK-NEXT:    s_mov_b32 s35, s24
-; CHECK-NEXT:    v_writelane_b32 v5, s67, 19
-; CHECK-NEXT:    s_load_dwordx8 s[20:27], s[40:41], 0x0
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[56:57], 0x0
+; CHECK-NEXT:    s_mov_b32 s5, s8
+; CHECK-NEXT:    s_mov_b32 s41, s8
+; CHECK-NEXT:    s_mov_b32 s35, s8
+; CHECK-NEXT:    v_writelane_b32 v7, s67, 31
+; CHECK-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[40:41], 0x0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[34:35], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    v_writelane_b32 v5, s68, 20
+; CHECK-NEXT:    v_writelane_b32 v6, s68, 20
 ; CHECK-NEXT:    s_xor_b64 s[72:73], vcc, -1
-; CHECK-NEXT:    v_writelane_b32 v5, s69, 21
+; CHECK-NEXT:    v_writelane_b32 v6, s69, 21
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mul_f32_e32 v0, v4, v3
+; CHECK-NEXT:    v_mul_f32_e32 v0, v5, v1
 ; CHECK-NEXT:    s_and_saveexec_b64 vcc, s[72:73]
 ; CHECK-NEXT:    s_xor_b64 s[34:35], exec, vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_3
 ; CHECK-NEXT:  ; %bb.1: ; %bb48
-; CHECK-NEXT:    image_sample_lz v3, v[1:2], s[12:19], s[28:31] dmask:0x1
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v7, 0
+; CHECK-NEXT:    v_readlane_b32 s12, v7, 8
+; CHECK-NEXT:    v_readlane_b32 s13, v7, 9
+; CHECK-NEXT:    v_readlane_b32 s14, v7, 10
+; CHECK-NEXT:    v_readlane_b32 s15, v7, 11
+; CHECK-NEXT:    v_readlane_b32 s16, v7, 12
+; CHECK-NEXT:    v_readlane_b32 s17, v7, 13
+; CHECK-NEXT:    v_readlane_b32 s18, v7, 14
+; CHECK-NEXT:    v_readlane_b32 s19, v7, 15
+; CHECK-NEXT:    v_mov_b32_e32 v1, v2
 ; CHECK-NEXT:    s_and_b64 vcc, exec, -1
+; CHECK-NEXT:    v_readlane_b32 s5, v7, 1
+; CHECK-NEXT:    v_readlane_b32 s6, v7, 2
+; CHECK-NEXT:    v_readlane_b32 s7, v7, 3
+; CHECK-NEXT:    image_sample_lz v3, v[3:4], s[12:19], s[28:31] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s8, v7, 4
+; CHECK-NEXT:    v_readlane_b32 s9, v7, 5
+; CHECK-NEXT:    v_readlane_b32 s10, v7, 6
+; CHECK-NEXT:    v_readlane_b32 s11, v7, 7
 ; CHECK-NEXT:  .LBB0_2: ; %bb50
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_mov_b32 s29, s28
-; CHECK-NEXT:    s_mov_b32 s30, s28
-; CHECK-NEXT:    s_mov_b32 s31, s28
+; CHECK-NEXT:    s_mov_b64 s[30:31], s[28:29]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[44:51], s[24:27] dmask:0x1
+; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[60:67], s[24:27] dmask:0x1
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[60:67], s[28:31] dmask:0x1
+; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[44:51], s[28:31] dmask:0x1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v1, v1, v4
 ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v0
@@ -107,68 +134,69 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: ; %Flow14
-; CHECK-NEXT:    s_andn2_saveexec_b64 s[12:13], s[34:35]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[24:25], s[34:35]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_10
 ; CHECK-NEXT:  ; %bb.4: ; %bb32
 ; CHECK-NEXT:    s_and_saveexec_b64 s[14:15], s[72:73]
-; CHECK-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; CHECK-NEXT:    s_xor_b64 s[26:27], exec, s[14:15]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_6
 ; CHECK-NEXT:  ; %bb.5: ; %bb43
-; CHECK-NEXT:    s_mov_b32 s16, 0
-; CHECK-NEXT:    s_mov_b32 s17, s16
-; CHECK-NEXT:    v_mov_b32_e32 v2, s16
-; CHECK-NEXT:    v_mov_b32_e32 v3, s17
-; CHECK-NEXT:    s_mov_b32 s18, s16
-; CHECK-NEXT:    s_mov_b32 s19, s16
-; CHECK-NEXT:    image_sample_lz v1, v[2:3], s[4:11], s[16:19] dmask:0x1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[42:43]
-; CHECK-NEXT:    v_readlane_b32 s36, v6, 0
-; CHECK-NEXT:    v_readlane_b32 s44, v6, 8
-; CHECK-NEXT:    v_readlane_b32 s45, v6, 9
-; CHECK-NEXT:    v_readlane_b32 s46, v6, 10
-; CHECK-NEXT:    v_readlane_b32 s47, v6, 11
-; CHECK-NEXT:    v_readlane_b32 s48, v6, 12
-; CHECK-NEXT:    v_readlane_b32 s49, v6, 13
-; CHECK-NEXT:    v_readlane_b32 s50, v6, 14
-; CHECK-NEXT:    v_readlane_b32 s51, v6, 15
-; CHECK-NEXT:    v_readlane_b32 s37, v6, 1
-; CHECK-NEXT:    v_readlane_b32 s38, v6, 2
-; CHECK-NEXT:    v_readlane_b32 s39, v6, 3
-; CHECK-NEXT:    v_readlane_b32 s40, v6, 4
-; CHECK-NEXT:    v_readlane_b32 s41, v6, 5
-; CHECK-NEXT:    image_sample_lz v0, v[2:3], s[44:51], s[20:23] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s42, v6, 6
-; CHECK-NEXT:    v_readlane_b32 s43, v6, 7
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    s_mov_b64 s[42:43], s[10:11]
-; CHECK-NEXT:    v_mov_b32_e32 v3, v2
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[4:5]
+; CHECK-NEXT:    s_mov_b64 s[44:45], 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v7, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_mov_b64 s[46:47], s[44:45]
+; CHECK-NEXT:    v_readlane_b32 s5, v7, 1
+; CHECK-NEXT:    v_readlane_b32 s6, v7, 2
+; CHECK-NEXT:    v_readlane_b32 s7, v7, 3
+; CHECK-NEXT:    v_readlane_b32 s8, v7, 4
+; CHECK-NEXT:    v_readlane_b32 s9, v7, 5
+; CHECK-NEXT:    v_readlane_b32 s10, v7, 6
+; CHECK-NEXT:    v_readlane_b32 s11, v7, 7
+; CHECK-NEXT:    v_readlane_b32 s12, v7, 8
+; CHECK-NEXT:    v_readlane_b32 s13, v7, 9
+; CHECK-NEXT:    v_readlane_b32 s14, v7, 10
+; CHECK-NEXT:    v_readlane_b32 s15, v7, 11
+; CHECK-NEXT:    v_readlane_b32 s16, v7, 12
+; CHECK-NEXT:    v_readlane_b32 s17, v7, 13
+; CHECK-NEXT:    v_readlane_b32 s18, v7, 14
+; CHECK-NEXT:    v_readlane_b32 s19, v7, 15
+; CHECK-NEXT:    image_sample_lz v2, v[0:1], s[4:11], s[44:47] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s4, v7, 16
+; CHECK-NEXT:    v_readlane_b32 s12, v7, 24
+; CHECK-NEXT:    v_readlane_b32 s13, v7, 25
+; CHECK-NEXT:    v_readlane_b32 s14, v7, 26
+; CHECK-NEXT:    v_readlane_b32 s15, v7, 27
+; CHECK-NEXT:    v_readlane_b32 s16, v7, 28
+; CHECK-NEXT:    v_readlane_b32 s17, v7, 29
+; CHECK-NEXT:    v_readlane_b32 s18, v7, 30
+; CHECK-NEXT:    v_readlane_b32 s19, v7, 31
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v4, v3
+; CHECK-NEXT:    v_readlane_b32 s5, v7, 17
+; CHECK-NEXT:    v_readlane_b32 s6, v7, 18
+; CHECK-NEXT:    v_readlane_b32 s7, v7, 19
+; CHECK-NEXT:    image_sample_lz v0, v[0:1], s[12:19], s[20:23] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s8, v7, 20
+; CHECK-NEXT:    v_readlane_b32 s9, v7, 21
+; CHECK-NEXT:    v_readlane_b32 s10, v7, 22
+; CHECK-NEXT:    v_readlane_b32 s11, v7, 23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dwordx3 v[1:3], off, s[16:19], 0
+; CHECK-NEXT:    buffer_store_dwordx3 v[2:4], off, s[44:47], 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[44:47], 0
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:  .LBB0_6: ; %Flow12
-; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], s[14:15]
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[14:15], s[26:27]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_9
 ; CHECK-NEXT:  ; %bb.7: ; %bb33.preheader
-; CHECK-NEXT:    s_mov_b32 s8, 0
-; CHECK-NEXT:    s_mov_b32 s6, s8
-; CHECK-NEXT:    s_mov_b32 s7, s8
-; CHECK-NEXT:    v_mov_b32_e32 v1, s6
-; CHECK-NEXT:    s_mov_b32 s9, s8
-; CHECK-NEXT:    s_mov_b32 s10, s8
-; CHECK-NEXT:    s_mov_b32 s11, s8
-; CHECK-NEXT:    v_mov_b32_e32 v2, s7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1
+; CHECK-NEXT:    s_mov_b64 s[16:17], 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    s_mov_b64 s[18:19], s[16:17]
+; CHECK-NEXT:    image_sample_lz v3, v[1:2], s[52:59], s[16:19] dmask:0x1
+; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[36:43], s[16:19] dmask:0x1
 ; CHECK-NEXT:    s_and_b64 vcc, exec, 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v1, v4, v3
@@ -181,35 +209,34 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_8
 ; CHECK-NEXT:  .LBB0_9: ; %Flow13
-; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; CHECK-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
-; CHECK-NEXT:    s_or_b64 exec, exec, s[12:13]
-; CHECK-NEXT:    v_readlane_b32 s69, v5, 21
-; CHECK-NEXT:    v_readlane_b32 s68, v5, 20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_readlane_b32 s67, v5, 19
-; CHECK-NEXT:    v_readlane_b32 s66, v5, 18
-; CHECK-NEXT:    v_readlane_b32 s65, v5, 17
-; CHECK-NEXT:    v_readlane_b32 s64, v5, 16
-; CHECK-NEXT:    v_readlane_b32 s55, v5, 15
-; CHECK-NEXT:    v_readlane_b32 s54, v5, 14
-; CHECK-NEXT:    v_readlane_b32 s53, v5, 13
-; CHECK-NEXT:    v_readlane_b32 s52, v5, 12
-; CHECK-NEXT:    v_readlane_b32 s51, v5, 11
-; CHECK-NEXT:    v_readlane_b32 s50, v5, 10
-; CHECK-NEXT:    v_readlane_b32 s49, v5, 9
-; CHECK-NEXT:    v_readlane_b32 s48, v5, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v5, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v5, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v5, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v5, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v5, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v5, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v5, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v5, 0
+; CHECK-NEXT:    s_or_b64 exec, exec, s[24:25]
+; CHECK-NEXT:    v_readlane_b32 s69, v6, 21
+; CHECK-NEXT:    v_readlane_b32 s68, v6, 20
+; CHECK-NEXT:    v_readlane_b32 s67, v6, 19
+; CHECK-NEXT:    v_readlane_b32 s66, v6, 18
+; CHECK-NEXT:    v_readlane_b32 s65, v6, 17
+; CHECK-NEXT:    v_readlane_b32 s64, v6, 16
+; CHECK-NEXT:    v_readlane_b32 s55, v6, 15
+; CHECK-NEXT:    v_readlane_b32 s54, v6, 14
+; CHECK-NEXT:    v_readlane_b32 s53, v6, 13
+; CHECK-NEXT:    v_readlane_b32 s52, v6, 12
+; CHECK-NEXT:    v_readlane_b32 s51, v6, 11
+; CHECK-NEXT:    v_readlane_b32 s50, v6, 10
+; CHECK-NEXT:    v_readlane_b32 s49, v6, 9
+; CHECK-NEXT:    v_readlane_b32 s48, v6, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v6, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v6, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v6, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v6, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v6, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v6, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll
index a328bbe8b4ddc..42168354897e5 100644
--- a/llvm/test/CodeGen/AMDGPU/imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm.ll
@@ -1532,8 +1532,8 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) {
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v1, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -1543,8 +1543,8 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    v_mov_b32_e32 v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 56a3ce7281030..3a40688b0c45c 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -9544,78 +9544,77 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; NOOPT-NEXT:    s_mov_b32 s19, 0xe8f000
 ; NOOPT-NEXT:    s_add_u32 s16, s16, s5
 ; NOOPT-NEXT:    s_addc_u32 s17, s17, 0
-; NOOPT-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; NOOPT-NEXT:    v_writelane_b32 v33, s4, 0
-; NOOPT-NEXT:    s_mov_b32 s4, s1
-; NOOPT-NEXT:    v_readlane_b32 s1, v33, 0
-; NOOPT-NEXT:    v_writelane_b32 v33, s4, 1
-; NOOPT-NEXT:    s_mov_b32 s4, s0
-; NOOPT-NEXT:    v_readlane_b32 s0, v33, 1
+; NOOPT-NEXT:    s_mov_b32 s5, s4
+; NOOPT-NEXT:    s_mov_b32 s4, s3
+; NOOPT-NEXT:    s_mov_b32 s6, s2
+; NOOPT-NEXT:    s_mov_b32 s7, s1
 ; NOOPT-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:144 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    v_mov_b32_e32 v2, v1
-; NOOPT-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
-; NOOPT-NEXT:    s_mov_b32 s5, s0
-; NOOPT-NEXT:    s_mov_b32 s6, s2
-; NOOPT-NEXT:    s_mov_b32 s7, s3
-; NOOPT-NEXT:    ; implicit-def: $sgpr0
-; NOOPT-NEXT:    ; implicit-def: $sgpr0
+; NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT:    s_mov_b32 s1, s7
+; NOOPT-NEXT:    s_mov_b32 s2, s6
+; NOOPT-NEXT:    s_mov_b32 s3, s4
+; NOOPT-NEXT:    ; implicit-def: $sgpr4
+; NOOPT-NEXT:    ; implicit-def: $sgpr4
 ; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; NOOPT-NEXT:    v_mov_b32_e32 v1, v2
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:136 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v1, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    s_mov_b32 s8, 0xf000
-; NOOPT-NEXT:    s_mov_b32 s0, 0
-; NOOPT-NEXT:    v_writelane_b32 v33, s0, 2
-; NOOPT-NEXT:    s_mov_b32 s2, s0
-; NOOPT-NEXT:    s_mov_b32 s3, s8
-; NOOPT-NEXT:    s_mov_b32 s8, s0
-; NOOPT-NEXT:    s_mov_b32 s9, s0
+; NOOPT-NEXT:    s_mov_b32 s4, 0
+; NOOPT-NEXT:    s_mov_b32 s6, s4
+; NOOPT-NEXT:    s_mov_b32 s7, s8
+; NOOPT-NEXT:    s_mov_b32 s8, s4
+; NOOPT-NEXT:    s_mov_b32 s9, s4
 ; NOOPT-NEXT:    ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11
-; NOOPT-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; NOOPT-NEXT:    v_writelane_b32 v33, s8, 3
-; NOOPT-NEXT:    v_writelane_b32 v33, s9, 4
-; NOOPT-NEXT:    v_writelane_b32 v33, s10, 5
-; NOOPT-NEXT:    v_writelane_b32 v33, s11, 6
-; NOOPT-NEXT:    ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7
-; NOOPT-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; NOOPT-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; NOOPT-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
+; NOOPT-NEXT:    v_writelane_b32 v33, s8, 0
+; NOOPT-NEXT:    v_writelane_b32 v33, s9, 1
+; NOOPT-NEXT:    v_writelane_b32 v33, s10, 2
+; NOOPT-NEXT:    v_writelane_b32 v33, s11, 3
+; NOOPT-NEXT:    ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; NOOPT-NEXT:    s_waitcnt expcnt(1)
-; NOOPT-NEXT:    v_mov_b32_e32 v0, s1
-; NOOPT-NEXT:    buffer_load_dword v0, v0, s[4:7], s0 offen
+; NOOPT-NEXT:    v_mov_b32_e32 v0, s5
+; NOOPT-NEXT:    buffer_load_dword v0, v0, s[0:3], s4 offen
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Spill
+; NOOPT-NEXT:    s_mov_b64 s[0:1], 0
+; NOOPT-NEXT:    v_writelane_b32 v33, s0, 4
+; NOOPT-NEXT:    v_writelane_b32 v33, s1, 5
 ; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    v_mov_b32_e32 v0, s0
-; NOOPT-NEXT:    v_mov_b32_e32 v30, s0
-; NOOPT-NEXT:    v_mov_b32_e32 v29, s0
+; NOOPT-NEXT:    v_mov_b32_e32 v1, s1
+; NOOPT-NEXT:    v_mov_b32_e32 v29, s1
 ; NOOPT-NEXT:    v_mov_b32_e32 v28, s0
-; NOOPT-NEXT:    v_mov_b32_e32 v27, s0
+; NOOPT-NEXT:    v_mov_b32_e32 v27, s1
 ; NOOPT-NEXT:    v_mov_b32_e32 v26, s0
-; NOOPT-NEXT:    v_mov_b32_e32 v25, s0
+; NOOPT-NEXT:    v_mov_b32_e32 v25, s1
 ; NOOPT-NEXT:    v_mov_b32_e32 v24, s0
-; NOOPT-NEXT:    v_mov_b32_e32 v23, s0
+; NOOPT-NEXT:    v_mov_b32_e32 v23, s1
 ; NOOPT-NEXT:    v_mov_b32_e32 v22, s0
-; NOOPT-NEXT:    v_mov_b32_e32 v21, s0
+; NOOPT-NEXT:    v_mov_b32_e32 v21, s1
 ; NOOPT-NEXT:    v_mov_b32_e32 v20, s0
-; NOOPT-NEXT:    v_mov_b32_e32 v19, s0
+; NOOPT-NEXT:    v_mov_b32_e32 v19, s1
 ; NOOPT-NEXT:    v_mov_b32_e32 v18, s0
-; NOOPT-NEXT:    v_mov_b32_e32 v17, s0
+; NOOPT-NEXT:    v_mov_b32_e32 v17, s1
 ; NOOPT-NEXT:    v_mov_b32_e32 v16, s0
-; NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
-; NOOPT-NEXT:    v_mov_b32_e32 v1, v30
-; NOOPT-NEXT:    v_mov_b32_e32 v2, v29
-; NOOPT-NEXT:    v_mov_b32_e32 v3, v28
-; NOOPT-NEXT:    v_mov_b32_e32 v4, v27
-; NOOPT-NEXT:    v_mov_b32_e32 v5, v26
-; NOOPT-NEXT:    v_mov_b32_e32 v6, v25
-; NOOPT-NEXT:    v_mov_b32_e32 v7, v24
-; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
-; NOOPT-NEXT:    v_mov_b32_e32 v9, v22
-; NOOPT-NEXT:    v_mov_b32_e32 v10, v21
-; NOOPT-NEXT:    v_mov_b32_e32 v11, v20
-; NOOPT-NEXT:    v_mov_b32_e32 v12, v19
-; NOOPT-NEXT:    v_mov_b32_e32 v13, v18
-; NOOPT-NEXT:    v_mov_b32_e32 v14, v17
-; NOOPT-NEXT:    v_mov_b32_e32 v15, v16
+; NOOPT-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT:    v_mov_b32_e32 v2, v28
+; NOOPT-NEXT:    v_mov_b32_e32 v3, v29
+; NOOPT-NEXT:    v_mov_b32_e32 v4, v26
+; NOOPT-NEXT:    v_mov_b32_e32 v5, v27
+; NOOPT-NEXT:    v_mov_b32_e32 v6, v24
+; NOOPT-NEXT:    v_mov_b32_e32 v7, v25
+; NOOPT-NEXT:    v_mov_b32_e32 v8, v22
+; NOOPT-NEXT:    v_mov_b32_e32 v9, v23
+; NOOPT-NEXT:    v_mov_b32_e32 v10, v20
+; NOOPT-NEXT:    v_mov_b32_e32 v11, v21
+; NOOPT-NEXT:    v_mov_b32_e32 v12, v18
+; NOOPT-NEXT:    v_mov_b32_e32 v13, v19
+; NOOPT-NEXT:    v_mov_b32_e32 v14, v16
+; NOOPT-NEXT:    v_mov_b32_e32 v15, v17
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:68 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v1, off, s[16:19], 0 offset:72 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v2, off, s[16:19], 0 offset:76 ; 4-byte Folded Spill
@@ -9633,8 +9632,8 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; NOOPT-NEXT:    buffer_store_dword v14, off, s[16:19], 0 offset:124 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v15, off, s[16:19], 0 offset:128 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
-; NOOPT-NEXT:    v_writelane_b32 v33, s0, 7
-; NOOPT-NEXT:    v_writelane_b32 v33, s1, 8
+; NOOPT-NEXT:    v_writelane_b32 v33, s0, 6
+; NOOPT-NEXT:    v_writelane_b32 v33, s1, 7
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
 ; NOOPT-NEXT:    buffer_store_dword v33, off, s[16:19], 0 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
@@ -9661,8 +9660,8 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; NOOPT-NEXT:    buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
-; NOOPT-NEXT:    v_readlane_b32 s0, v33, 9
-; NOOPT-NEXT:    v_readlane_b32 s1, v33, 10
+; NOOPT-NEXT:    v_readlane_b32 s0, v33, 8
+; NOOPT-NEXT:    v_readlane_b32 s1, v33, 9
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v1, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
@@ -9727,8 +9726,8 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; NOOPT-NEXT:    buffer_store_dword v14, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    s_mov_b64 s[2:3], s[0:1]
-; NOOPT-NEXT:    v_writelane_b32 v33, s2, 9
-; NOOPT-NEXT:    v_writelane_b32 v33, s3, 10
+; NOOPT-NEXT:    v_writelane_b32 v33, s2, 8
+; NOOPT-NEXT:    v_writelane_b32 v33, s3, 9
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
 ; NOOPT-NEXT:    buffer_store_dword v33, off, s[16:19], 0 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
@@ -9740,18 +9739,18 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; NOOPT-NEXT:    buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
-; NOOPT-NEXT:    v_readlane_b32 s0, v33, 7
-; NOOPT-NEXT:    v_readlane_b32 s1, v33, 8
+; NOOPT-NEXT:    v_readlane_b32 s0, v33, 6
+; NOOPT-NEXT:    v_readlane_b32 s1, v33, 7
 ; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
 ; NOOPT-NEXT:  ; %bb.3:
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
 ; NOOPT-NEXT:    buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
-; NOOPT-NEXT:    v_readlane_b32 s0, v33, 3
-; NOOPT-NEXT:    v_readlane_b32 s1, v33, 4
-; NOOPT-NEXT:    v_readlane_b32 s2, v33, 5
-; NOOPT-NEXT:    v_readlane_b32 s3, v33, 6
+; NOOPT-NEXT:    v_readlane_b32 s0, v33, 0
+; NOOPT-NEXT:    v_readlane_b32 s1, v33, 1
+; NOOPT-NEXT:    v_readlane_b32 s2, v33, 2
+; NOOPT-NEXT:    v_readlane_b32 s3, v33, 3
 ; NOOPT-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v5, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v17, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload
@@ -9839,26 +9838,26 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; SI-MOVREL:       ; %bb.0: ; %entry
 ; SI-MOVREL-NEXT:    v_mov_b32_e32 v2, s4
 ; SI-MOVREL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
-; SI-MOVREL-NEXT:    s_mov_b32 s2, 0
 ; SI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0
+; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0
+; SI-MOVREL-NEXT:    s_mov_b32 s2, 0
+; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, v6
+; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, v6
+; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, v6
+; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, v6
+; SI-MOVREL-NEXT:    v_mov_b32_e32 v16, v6
+; SI-MOVREL-NEXT:    v_mov_b32_e32 v18, v6
+; SI-MOVREL-NEXT:    v_mov_b32_e32 v20, v6
 ; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-MOVREL-NEXT:    s_mov_b32 s0, s2
 ; SI-MOVREL-NEXT:    s_mov_b32 s1, s2
-; SI-MOVREL-NEXT:    v_mov_b32_e32 v6, v5
 ; SI-MOVREL-NEXT:    v_mov_b32_e32 v7, v5
-; SI-MOVREL-NEXT:    v_mov_b32_e32 v8, v5
 ; SI-MOVREL-NEXT:    v_mov_b32_e32 v9, v5
-; SI-MOVREL-NEXT:    v_mov_b32_e32 v10, v5
 ; SI-MOVREL-NEXT:    v_mov_b32_e32 v11, v5
-; SI-MOVREL-NEXT:    v_mov_b32_e32 v12, v5
 ; SI-MOVREL-NEXT:    v_mov_b32_e32 v13, v5
-; SI-MOVREL-NEXT:    v_mov_b32_e32 v14, v5
 ; SI-MOVREL-NEXT:    v_mov_b32_e32 v15, v5
-; SI-MOVREL-NEXT:    v_mov_b32_e32 v16, v5
 ; SI-MOVREL-NEXT:    v_mov_b32_e32 v17, v5
-; SI-MOVREL-NEXT:    v_mov_b32_e32 v18, v5
 ; SI-MOVREL-NEXT:    v_mov_b32_e32 v19, v5
-; SI-MOVREL-NEXT:    v_mov_b32_e32 v20, v5
 ; SI-MOVREL-NEXT:    s_mov_b64 s[4:5], exec
 ; SI-MOVREL-NEXT:  .LBB27_1: ; =>This Inner Loop Header: Depth=1
 ; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
@@ -9882,21 +9881,21 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; VI-MOVREL-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-MOVREL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; VI-MOVREL-NEXT:    v_mov_b32_e32 v5, 0
-; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, v5
+; VI-MOVREL-NEXT:    v_mov_b32_e32 v6, 0
+; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, v6
+; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, v6
+; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, v6
+; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, v6
+; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, v6
+; VI-MOVREL-NEXT:    v_mov_b32_e32 v18, v6
+; VI-MOVREL-NEXT:    v_mov_b32_e32 v20, v6
 ; VI-MOVREL-NEXT:    v_mov_b32_e32 v7, v5
-; VI-MOVREL-NEXT:    v_mov_b32_e32 v8, v5
 ; VI-MOVREL-NEXT:    v_mov_b32_e32 v9, v5
-; VI-MOVREL-NEXT:    v_mov_b32_e32 v10, v5
 ; VI-MOVREL-NEXT:    v_mov_b32_e32 v11, v5
-; VI-MOVREL-NEXT:    v_mov_b32_e32 v12, v5
 ; VI-MOVREL-NEXT:    v_mov_b32_e32 v13, v5
-; VI-MOVREL-NEXT:    v_mov_b32_e32 v14, v5
 ; VI-MOVREL-NEXT:    v_mov_b32_e32 v15, v5
-; VI-MOVREL-NEXT:    v_mov_b32_e32 v16, v5
 ; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, v5
-; VI-MOVREL-NEXT:    v_mov_b32_e32 v18, v5
 ; VI-MOVREL-NEXT:    v_mov_b32_e32 v19, v5
-; VI-MOVREL-NEXT:    v_mov_b32_e32 v20, v5
 ; VI-MOVREL-NEXT:    s_mov_b64 s[0:1], exec
 ; VI-MOVREL-NEXT:  .LBB27_1: ; =>This Inner Loop Header: Depth=1
 ; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
@@ -9926,21 +9925,21 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-IDXMODE-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0
-; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, v5
+; VI-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0
+; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, v6
+; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, v6
+; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, v6
+; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, v6
+; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, v6
+; VI-IDXMODE-NEXT:    v_mov_b32_e32 v18, v6
+; VI-IDXMODE-NEXT:    v_mov_b32_e32 v20, v6
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v7, v5
-; VI-IDXMODE-NEXT:    v_mov_b32_e32 v8, v5
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v9, v5
-; VI-IDXMODE-NEXT:    v_mov_b32_e32 v10, v5
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v11, v5
-; VI-IDXMODE-NEXT:    v_mov_b32_e32 v12, v5
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v13, v5
-; VI-IDXMODE-NEXT:    v_mov_b32_e32 v14, v5
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v15, v5
-; VI-IDXMODE-NEXT:    v_mov_b32_e32 v16, v5
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, v5
-; VI-IDXMODE-NEXT:    v_mov_b32_e32 v18, v5
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v19, v5
-; VI-IDXMODE-NEXT:    v_mov_b32_e32 v20, v5
 ; VI-IDXMODE-NEXT:    s_mov_b64 s[0:1], exec
 ; VI-IDXMODE-NEXT:  .LBB27_1: ; =>This Inner Loop Header: Depth=1
 ; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
@@ -9971,21 +9970,21 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-IDXMODE-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, v6
+; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, v6
+; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, v6
+; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, v6
+; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, v6
+; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v18, v6
+; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v20, v6
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v7, v5
-; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v11, v5
-; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v13, v5
-; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v14, v5
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v15, v5
-; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v16, v5
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, v5
-; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v18, v5
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v19, v5
-; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v20, v5
 ; GFX9-IDXMODE-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX9-IDXMODE-NEXT:  .LBB27_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
index f961e857f39e5..37990085e6abf 100644
--- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
@@ -15,11 +15,9 @@ define void @issue92561(ptr addrspace(1) %arg) {
 ; SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
 ; SDAG-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; SDAG-NEXT:    v_mov_b32_e32 v8, 0
-; SDAG-NEXT:    s_mov_b32 s12, 0
+; SDAG-NEXT:    s_mov_b64 s[12:13], 0
 ; SDAG-NEXT:    s_mov_b32 s3, exec_lo
-; SDAG-NEXT:    s_mov_b32 s13, s12
-; SDAG-NEXT:    s_mov_b32 s14, s12
-; SDAG-NEXT:    s_mov_b32 s15, s12
+; SDAG-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; SDAG-NEXT:    v_readfirstlane_b32 s4, v0
@@ -51,14 +49,10 @@ define void @issue92561(ptr addrspace(1) %arg) {
 ; SDAG-NEXT:    s_mov_b32 exec_lo, s3
 ; SDAG-NEXT:    v_dual_mov_b32 v0, 0x7fc00000 :: v_dual_mov_b32 v1, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 1.0
-; SDAG-NEXT:    s_mov_b32 s0, s12
-; SDAG-NEXT:    s_mov_b32 s1, s12
-; SDAG-NEXT:    s_mov_b32 s2, s12
-; SDAG-NEXT:    s_mov_b32 s3, s12
-; SDAG-NEXT:    s_mov_b32 s4, s12
-; SDAG-NEXT:    s_mov_b32 s5, s12
-; SDAG-NEXT:    s_mov_b32 s6, s12
-; SDAG-NEXT:    s_mov_b32 s7, s12
+; SDAG-NEXT:    s_mov_b64 s[0:1], s[12:13]
+; SDAG-NEXT:    s_mov_b64 s[2:3], s[12:13]
+; SDAG-NEXT:    s_mov_b64 s[4:5], s[12:13]
+; SDAG-NEXT:    s_mov_b64 s[6:7], s[12:13]
 ; SDAG-NEXT:    s_clause 0x2
 ; SDAG-NEXT:    image_sample_c_lz v0, [v1, v1, v0, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; SDAG-NEXT:    image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll b/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll
index 7caa563d8b298..6a5a7cc3e6219 100644
--- a/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll
@@ -8,12 +8,10 @@ define amdgpu_vs void @test(i32 inreg %cmp, i32 %e0) {
 ; CHECK-LABEL: test:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
-; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %load
-; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    s_mov_b32 s2, s0
-; CHECK-NEXT:    s_mov_b32 s3, s0
+; CHECK-NEXT:    s_mov_b64 s[0:1], 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    buffer_load_format_xy v[1:2], v1, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 8732c77778b01..65bc36644fdad 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -464,7 +464,9 @@ bb:
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm_splat:
 ; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
 ; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
-; NOLIT-SRCC-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; NOLIT-SRCC-DAG:  v_mov_b32_e32 v0, 0
+; NOLIT-SRCC-DAG:  v_mov_b32_e32 v1, 0
+; NOLIT-SRCC-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
 ; NOLIT-SRCC:      v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
 ; LIT-SRCC:        v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0
 ; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0
diff --git a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
index 3b855a56a5abb..beff113dc0661 100644
--- a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
+++ b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
@@ -7,11 +7,11 @@ define <2 x i32> @uniform_masked_load_ptr1_mask_v2i32(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
 ; GFX942-NEXT:  .LBB0_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -30,13 +30,12 @@ define <4 x i32> @uniform_masked_load_ptr1_mask_v4i32(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
 ; GFX942-NEXT:  .LBB1_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -55,13 +54,12 @@ define <4 x float> @uniform_masked_load_ptr1_mask_v4f32(ptr addrspace(1) inreg n
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
 ; GFX942-NEXT:  .LBB2_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -80,20 +78,16 @@ define <8 x i32> @uniform_masked_load_ptr1_mask_v8i32(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
-; GFX942-NEXT:    global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
 ; GFX942-NEXT:  .LBB3_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
@@ -111,20 +105,16 @@ define <8 x float> @uniform_masked_load_ptr1_mask_v8f32(ptr addrspace(1) inreg n
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
-; GFX942-NEXT:    global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
 ; GFX942-NEXT:  .LBB4_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
@@ -142,13 +132,12 @@ define <8 x i16> @uniform_masked_load_ptr1_mask_v8i16(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
 ; GFX942-NEXT:  .LBB5_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -167,13 +156,12 @@ define <8 x half> @uniform_masked_load_ptr1_mask_v8f16(ptr addrspace(1) inreg no
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
 ; GFX942-NEXT:  .LBB6_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -192,13 +180,12 @@ define <8 x bfloat> @uniform_masked_load_ptr1_mask_v8bf16(ptr addrspace(1) inreg
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
 ; GFX942-NEXT:  .LBB7_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
index af713179a888d..4d5b532d3c90b 100644
--- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
+++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
@@ -11,11 +11,9 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
 ; GFX10-LABEL: long_store_chain:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
-; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    s_mov_b32 s1, s0
-; GFX10-NEXT:    s_mov_b32 s2, s0
-; GFX10-NEXT:    s_mov_b32 s3, s0
+; GFX10-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
@@ -92,11 +90,9 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
 ; GFX11-LABEL: long_store_chain:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
-; GFX11-NEXT:    s_mov_b32 s3, s0
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s2
@@ -175,11 +171,9 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
 ; GFX12-LABEL: long_store_chain:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    s_mov_b64 s[0:1], 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
index 8157b1a7f7c80..8a8c423ba113a 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
@@ -175,9 +175,9 @@ define void @issue63986_reduced_expanded(i64 %idxprom) {
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  .LBB1_8: ; %post-loop-memcpy-expansion
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v5, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v2
-; CHECK-NEXT:    v_mov_b32_e32 v5, v2
 ; CHECK-NEXT:    s_and_b64 vcc, exec, 0
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:  .LBB1_9: ; %loop-memcpy-expansion2
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 4a635d6e7f59f..e2b28ee5e92b0 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -458,7 +458,9 @@ exit:
 
 ; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:
 
-; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908:          v_mov_b32_e32 v0, 0
+; GFX908:          v_mov_b32_e32 v1, 0
+; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
 ; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
 ; GFX90A-NOT:      v_accvgpr
 ; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
@@ -499,7 +501,9 @@ exit:
 
 ; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:
 
-; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GFX908:          v_mov_b32_e32 v0, 0
+; GFX908:          v_mov_b32_e32 v1, 0
+; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
 ; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
 ; GFX90A-NOT:      v_accvgpr
 ; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
@@ -577,9 +581,8 @@ exit:
 
 ; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:
 
-; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
-; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
+; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GFX90A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
 
 ; Check that we do not copy agprs to vgprs and back in an outer loop.
 
diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
index 43e3a1fa29483..890c7234b2e45 100644
--- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
+++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
@@ -28,20 +28,22 @@ define void @nonkernel() {
 ; GFX9-LABEL: nonkernel:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    ds_write_b32 v0, v0 offset:8
-; GFX9-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    ds_write_b32 v2, v2 offset:8
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: nonkernel:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10-NEXT:    ds_write_b32 v0, v0 offset:8
-; GFX10-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    ds_write_b32 v2, v2 offset:8
+; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
index d62f045674ace..4b58fdab44bc6 100644
--- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
@@ -8,7 +8,7 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX10-NEXT:    s_mov_b32 s1, 0
 ; GFX10-NEXT:    ; implicit-def: $sgpr2
 ; GFX10-NEXT:    s_inst_prefetch 0x1
@@ -29,17 +29,11 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX10-NEXT:    s_cbranch_execz .LBB0_1
 ; GFX10-NEXT:  ; %bb.3: ; %branch2_merge
 ; GFX10-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GFX10-NEXT:    s_mov_b32 s5, s4
-; GFX10-NEXT:    s_mov_b32 s6, s4
-; GFX10-NEXT:    s_mov_b32 s7, s4
-; GFX10-NEXT:    s_mov_b32 s8, s4
-; GFX10-NEXT:    s_mov_b32 s9, s4
-; GFX10-NEXT:    s_mov_b32 s10, s4
-; GFX10-NEXT:    s_mov_b32 s11, s4
-; GFX10-NEXT:    s_mov_b32 s12, s4
-; GFX10-NEXT:    s_mov_b32 s13, s4
-; GFX10-NEXT:    s_mov_b32 s14, s4
-; GFX10-NEXT:    s_mov_b32 s15, s4
+; GFX10-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX10-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX10-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX10-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX10-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
 ; GFX10-NEXT:    image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -57,7 +51,7 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX12-NEXT:    s_mov_b32 s1, 0
 ; GFX12-NEXT:    ; implicit-def: $sgpr2
 ; GFX12-NEXT:    s_branch .LBB0_2
@@ -77,17 +71,11 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX12-NEXT:    s_cbranch_execz .LBB0_1
 ; GFX12-NEXT:  ; %bb.3: ; %branch2_merge
 ; GFX12-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GFX12-NEXT:    s_mov_b32 s5, s4
-; GFX12-NEXT:    s_mov_b32 s6, s4
-; GFX12-NEXT:    s_mov_b32 s7, s4
-; GFX12-NEXT:    s_mov_b32 s8, s4
-; GFX12-NEXT:    s_mov_b32 s9, s4
-; GFX12-NEXT:    s_mov_b32 s10, s4
-; GFX12-NEXT:    s_mov_b32 s11, s4
-; GFX12-NEXT:    s_mov_b32 s12, s4
-; GFX12-NEXT:    s_mov_b32 s13, s4
-; GFX12-NEXT:    s_mov_b32 s14, s4
-; GFX12-NEXT:    s_mov_b32 s15, s4
+; GFX12-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX12-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX12-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX12-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX12-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX12-NEXT:    s_and_not1_b32 s2, s2, exec_lo
 ; GFX12-NEXT:    image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
index 3844d6054e130..493139df6357c 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -8,34 +8,32 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    s_mov_b32 s4, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
-; GFX942-NEXT:    s_mov_b32 s3, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
 ; GFX942-NEXT:    s_branch .LBB0_2
 ; GFX942-NEXT:  .LBB0_1: ; %bb2
 ; GFX942-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GFX942-NEXT:    s_or_b32 s4, s3, 1
-; GFX942-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX942-NEXT:    s_mov_b32 s3, s2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    s_nop 2
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a0, a2
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a1
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT:    s_and_b32 s3, s5, s4
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3]
+; GFX942-NEXT:    s_or_b32 s2, s4, 1
+; GFX942-NEXT:    s_ashr_i32 s3, s4, 31
+; GFX942-NEXT:    v_mfma_f32_16x16x16_f16 a[2:5], v[0:1], v[0:1], a[0:3]
+; GFX942-NEXT:    s_and_b32 s4, s3, s2
 ; GFX942-NEXT:    s_cbranch_execz .LBB0_4
 ; GFX942-NEXT:  .LBB0_2: ; %bb
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_vccz .LBB0_1
 ; GFX942-NEXT:  ; %bb.3:
-; GFX942-NEXT:    ; implicit-def: $sgpr3
+; GFX942-NEXT:    ; implicit-def: $sgpr4
 ; GFX942-NEXT:    ; implicit-def: $agpr2
 ; GFX942-NEXT:  .LBB0_4: ; %common.ret
 ; GFX942-NEXT:    s_endpgm
@@ -45,38 +43,36 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX908-NEXT:    v_accvgpr_write_b32 a1, 0
-; GFX908-NEXT:    s_mov_b32 s2, 0
+; GFX908-NEXT:    s_mov_b32 s4, 0
 ; GFX908-NEXT:    v_accvgpr_write_b32 a2, v0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX908-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX908-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
-; GFX908-NEXT:    s_mov_b32 s3, 0
+; GFX908-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
+; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v2
 ; GFX908-NEXT:    s_branch .LBB0_2
 ; GFX908-NEXT:  .LBB0_1: ; %bb2
 ; GFX908-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GFX908-NEXT:    s_or_b32 s4, s3, 1
-; GFX908-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX908-NEXT:    s_mov_b32 s3, s2
-; GFX908-NEXT:    v_mov_b32_e32 v1, s2
-; GFX908-NEXT:    s_nop 2
-; GFX908-NEXT:    v_accvgpr_read_b32 v0, a2
-; GFX908-NEXT:    v_mov_b32_e32 v2, s3
+; GFX908-NEXT:    s_nop 4
+; GFX908-NEXT:    v_accvgpr_read_b32 v2, a2
 ; GFX908-NEXT:    v_accvgpr_read_b32 v4, a1
 ; GFX908-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX908-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v2
 ; GFX908-NEXT:    v_accvgpr_write_b32 a2, v4
 ; GFX908-NEXT:    v_accvgpr_write_b32 a3, v3
-; GFX908-NEXT:    s_and_b32 s3, s5, s4
-; GFX908-NEXT:    v_mfma_f32_16x16x16f16 a[2:5], v[1:2], v[1:2], a[0:3]
+; GFX908-NEXT:    s_or_b32 s2, s4, 1
+; GFX908-NEXT:    v_mfma_f32_16x16x16f16 a[2:5], v[0:1], v[0:1], a[0:3]
+; GFX908-NEXT:    s_ashr_i32 s3, s4, 31
+; GFX908-NEXT:    s_and_b32 s4, s3, s2
 ; GFX908-NEXT:    s_cbranch_execz .LBB0_4
 ; GFX908-NEXT:  .LBB0_2: ; %bb
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB0_1
 ; GFX908-NEXT:  ; %bb.3:
-; GFX908-NEXT:    ; implicit-def: $sgpr3
+; GFX908-NEXT:    ; implicit-def: $sgpr4
 ; GFX908-NEXT:    ; implicit-def: $agpr2
 ; GFX908-NEXT:  .LBB0_4: ; %common.ret
 ; GFX908-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
index debbfce7dadcc..9769718481f18 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX1150 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX1150 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GIGFX1150 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX12 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GIGFX12 %s
 
 define amdgpu_vs float @fadd_f32(float inreg %a, float inreg %b) {
 ; CHECK-LABEL: fadd_f32:
@@ -45,12 +45,26 @@ define amdgpu_vs float @fmin_f32(float inreg %a, float inreg %b) {
 ; GFX1150-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1150-NEXT:    ; return to shader part epilog
 ;
+; GIGFX1150-LABEL: fmin_f32:
+; GIGFX1150:       ; %bb.0:
+; GIGFX1150-NEXT:    s_min_f32 s0, s0, s1
+; GIGFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GIGFX1150-NEXT:    v_mov_b32_e32 v0, s0
+; GIGFX1150-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: fmin_f32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_min_num_f32 s0, s0, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GIGFX12-LABEL: fmin_f32:
+; GIGFX12:       ; %bb.0:
+; GIGFX12-NEXT:    s_min_num_f32 s0, s0, s1
+; GIGFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GIGFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GIGFX12-NEXT:    ; return to shader part epilog
    %min = call float @llvm.minnum.f32(float %a, float %b)
    ret float %min
 }
@@ -63,12 +77,26 @@ define amdgpu_vs float @fmax_f32(float inreg %a, float inreg %b) {
 ; GFX1150-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1150-NEXT:    ; return to shader part epilog
 ;
+; GIGFX1150-LABEL: fmax_f32:
+; GIGFX1150:       ; %bb.0:
+; GIGFX1150-NEXT:    s_max_f32 s0, s0, s1
+; GIGFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GIGFX1150-NEXT:    v_mov_b32_e32 v0, s0
+; GIGFX1150-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: fmax_f32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_max_num_f32 s0, s0, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GIGFX12-LABEL: fmax_f32:
+; GIGFX12:       ; %bb.0:
+; GIGFX12-NEXT:    s_max_num_f32 s0, s0, s1
+; GIGFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GIGFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GIGFX12-NEXT:    ; return to shader part epilog
    %max = call float @llvm.maxnum.f32(float %a, float %b)
    ret float %max
 }
@@ -114,12 +142,26 @@ define amdgpu_vs half @fmin_f16(half inreg %a, half inreg %b) {
 ; GFX1150-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1150-NEXT:    ; return to shader part epilog
 ;
+; GIGFX1150-LABEL: fmin_f16:
+; GIGFX1150:       ; %bb.0:
+; GIGFX1150-NEXT:    s_min_f16 s0, s0, s1
+; GIGFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GIGFX1150-NEXT:    v_mov_b32_e32 v0, s0
+; GIGFX1150-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: fmin_f16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_min_num_f16 s0, s0, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GIGFX12-LABEL: fmin_f16:
+; GIGFX12:       ; %bb.0:
+; GIGFX12-NEXT:    s_min_num_f16 s0, s0, s1
+; GIGFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GIGFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GIGFX12-NEXT:    ; return to shader part epilog
    %min = call half @llvm.minnum.f16(half %a, half %b)
    ret half %min
 }
@@ -132,12 +174,26 @@ define amdgpu_vs half @fmax_f16(half inreg %a, half inreg %b) {
 ; GFX1150-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1150-NEXT:    ; return to shader part epilog
 ;
+; GIGFX1150-LABEL: fmax_f16:
+; GIGFX1150:       ; %bb.0:
+; GIGFX1150-NEXT:    s_max_f16 s0, s0, s1
+; GIGFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GIGFX1150-NEXT:    v_mov_b32_e32 v0, s0
+; GIGFX1150-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: fmax_f16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_max_num_f16 s0, s0, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GIGFX12-LABEL: fmax_f16:
+; GIGFX12:       ; %bb.0:
+; GIGFX12-NEXT:    s_max_num_f16 s0, s0, s1
+; GIGFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GIGFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GIGFX12-NEXT:    ; return to shader part epilog
    %max = call half @llvm.maxnum.f16(half %a, half %b)
    ret half %max
 }
@@ -211,31 +267,53 @@ define amdgpu_vs half @fmac_f16_with_mov(half inreg %a, half inreg %b, half inre
 define amdgpu_ps float @_amdgpu_ps_main() {
 ; GFX1150-LABEL: _amdgpu_ps_main:
 ; GFX1150:       ; %bb.0: ; %bb
-; GFX1150-NEXT:    s_mov_b32 s0, 0
-; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1150-NEXT:    s_mov_b32 s1, s0
-; GFX1150-NEXT:    s_mov_b32 s2, s0
-; GFX1150-NEXT:    s_mov_b32 s3, s0
+; GFX1150-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_3)
+; GFX1150-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX1150-NEXT:    s_buffer_load_b64 s[0:1], s[0:3], 0x0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_fmac_f32 s0, s1, 4.0
-; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1150-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1150-NEXT:    ; return to shader part epilog
 ;
+; GIGFX1150-LABEL: _amdgpu_ps_main:
+; GIGFX1150:       ; %bb.0: ; %bb
+; GIGFX1150-NEXT:    s_mov_b32 s0, 0
+; GIGFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GIGFX1150-NEXT:    s_mov_b32 s1, s0
+; GIGFX1150-NEXT:    s_mov_b32 s2, s0
+; GIGFX1150-NEXT:    s_mov_b32 s3, s0
+; GIGFX1150-NEXT:    s_buffer_load_b64 s[0:1], s[0:3], 0x0
+; GIGFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GIGFX1150-NEXT:    s_fmac_f32 s0, s1, 4.0
+; GIGFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GIGFX1150-NEXT:    v_mov_b32_e32 v0, s0
+; GIGFX1150-NEXT:    ; return to shader part epilog
+;
 ; GFX12-LABEL: _amdgpu_ps_main:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    s_mov_b64 s[0:1], 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX12-NEXT:    s_buffer_load_b64 s[0:1], s[0:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_fmac_f32 s0, s1, 4.0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GIGFX12-LABEL: _amdgpu_ps_main:
+; GIGFX12:       ; %bb.0: ; %bb
+; GIGFX12-NEXT:    s_mov_b32 s0, 0
+; GIGFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GIGFX12-NEXT:    s_mov_b32 s1, s0
+; GIGFX12-NEXT:    s_mov_b32 s2, s0
+; GIGFX12-NEXT:    s_mov_b32 s3, s0
+; GIGFX12-NEXT:    s_buffer_load_b64 s[0:1], s[0:3], 0x0
+; GIGFX12-NEXT:    s_wait_kmcnt 0x0
+; GIGFX12-NEXT:    s_fmac_f32 s0, s1, 4.0
+; GIGFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GIGFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GIGFX12-NEXT:    ; return to shader part epilog
 bb:
   %i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0)
   %i1 = bitcast i32 %i to float
diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll
index 118c47e680709..bc9a3ec97ae34 100644
--- a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll
@@ -9,38 +9,32 @@ declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2
 define amdgpu_ps void @_amdgpu_ps_main(float %arg) {
 ; GFX900-LABEL: _amdgpu_ps_main:
 ; GFX900:       ; %bb.0: ; %bb
-; GFX900-NEXT:    s_mov_b64 s[4:5], exec
+; GFX900-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX900-NEXT:    s_wqm_b64 exec, exec
 ; GFX900-NEXT:    v_mov_b32_e32 v1, v0
-; GFX900-NEXT:    s_mov_b32 s0, 0
 ; GFX900-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; GFX900-NEXT:    ; implicit-def: $vgpr0
 ; GFX900-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX900-NEXT:    s_xor_b64 s[6:7], exec, s[2:3]
+; GFX900-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX900-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX900-NEXT:  ; %bb.1: ; %bb1
+; GFX900-NEXT:    s_mov_b64 s[12:13], 0
 ; GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; GFX900-NEXT:    s_mov_b32 s1, s0
-; GFX900-NEXT:    s_mov_b32 s2, s0
-; GFX900-NEXT:    s_mov_b32 s3, s0
-; GFX900-NEXT:    s_mov_b32 s8, s0
-; GFX900-NEXT:    s_mov_b32 s9, s0
-; GFX900-NEXT:    s_mov_b32 s10, s0
-; GFX900-NEXT:    s_mov_b32 s11, s0
-; GFX900-NEXT:    s_mov_b32 s12, s0
-; GFX900-NEXT:    s_mov_b32 s13, s0
-; GFX900-NEXT:    s_mov_b32 s14, s0
-; GFX900-NEXT:    s_mov_b32 s15, s0
-; GFX900-NEXT:    image_sample v[0:1], v[0:1], s[8:15], s[0:3] dmask:0x3
+; GFX900-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX900-NEXT:    s_mov_b64 s[4:5], s[12:13]
+; GFX900-NEXT:    s_mov_b64 s[6:7], s[12:13]
+; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX900-NEXT:    image_sample v[0:1], v[0:1], s[4:11], s[12:15] dmask:0x3
 ; GFX900-NEXT:  .LBB0_2: ; %Flow
-; GFX900-NEXT:    s_or_saveexec_b64 s[0:1], s[6:7]
-; GFX900-NEXT:    s_and_b64 exec, exec, s[4:5]
-; GFX900-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX900-NEXT:    s_or_saveexec_b64 s[2:3], s[2:3]
+; GFX900-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX900-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
 ; GFX900-NEXT:    v_mov_b32_e32 v2, 1.0
-; GFX900-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX900-NEXT:    s_xor_b64 exec, exec, s[2:3]
 ; GFX900-NEXT:    s_cbranch_execz .LBB0_5
 ; GFX900-NEXT:  ; %bb.3: ; %bb5
-; GFX900-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
+; GFX900-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX900-NEXT:    s_cbranch_scc0 .LBB0_6
 ; GFX900-NEXT:  ; %bb.4: ; %bb5
 ; GFX900-NEXT:    s_mov_b64 exec, 0
@@ -48,7 +42,7 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) {
 ; GFX900-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-NEXT:  .LBB0_5: ; %bb6
-; GFX900-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX900-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_cvt_pkrtz_f16_f32 v1, 0, v1
 ; GFX900-NEXT:    v_cvt_pkrtz_f16_f32 v0, v2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
index 5f101c360f148..bb54184dd73c1 100644
--- a/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
+++ b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
@@ -6,14 +6,14 @@ define amdgpu_ps void @_amdgpu_ps_main() {
   ; CHECK: bb.0.entry:
   ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+  ; CHECK-NEXT:   [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B]], %subreg.sub0_sub1, [[S_MOV_B]], %subreg.sub2_sub3
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM killed [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
-  ; CHECK-NEXT:   nofpexcept S_CMP_NLT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], implicit-def $scc, implicit $mode
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   nofpexcept S_CMP_NLT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_]], implicit-def $scc, implicit $mode
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
   ; CHECK-NEXT:   SI_KILL_I1_PSEUDO killed [[COPY]], 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   nofpexcept S_CMP_LT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], implicit-def $scc, implicit $mode
+  ; CHECK-NEXT:   nofpexcept S_CMP_LT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_]], implicit-def $scc, implicit $mode
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
   ; CHECK-NEXT:   S_BRANCH %bb.1
   ; CHECK-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
index d34769ad0fcf0..376c49e4ab769 100644
--- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
@@ -10,9 +10,9 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev
 ; GCN-NEXT:    .cfi_startproc
 ; GCN-NEXT:  ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    global_load_dwordx2 v[1:2], v[3:4], off
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
@@ -27,8 +27,6 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-NEXT:  .LBB0_3: ; %bb2
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    v_mov_b32_e32 v4, v3
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
 ; GCN-NEXT:    ; implicit-def: $vgpr1_vgpr2
@@ -36,7 +34,7 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev
 ; GCN-NEXT:    s_cbranch_execz .LBB0_2
 ; GCN-NEXT:  .LBB0_4: ; %bb1
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    v_mov_b32_e32 v4, v3
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll b/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll
index ac271ff6a258b..a2751cad5e902 100644
--- a/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll
@@ -5,15 +5,14 @@ define amdgpu_gs void @_amdgpu_gs_main() {
 ; CHECK-LABEL: _amdgpu_gs_main:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    s_mov_b32 s2, s0
-; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_mov_b64 s[0:1], 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v3, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v0
-; CHECK-NEXT:    v_mov_b32_e32 v3, v0
-; CHECK-NEXT:    s_mov_b32 s3, s0
 ; CHECK-NEXT:    exp mrt0 off, off, off, off
-; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_endpgm
 entry:
   call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, i1 false, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
index 899ec36e9b2fe..a4e876604f50d 100644
--- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
@@ -6,25 +6,22 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX942-LABEL: test:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
-; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v12, s4
-; GFX942-NEXT:    v_mov_b32_e32 v13, s5
-; GFX942-NEXT:    v_mov_b32_e32 v4, s6
-; GFX942-NEXT:    v_mov_b32_e32 v5, s7
-; GFX942-NEXT:    v_mov_b32_e32 v6, s7
-; GFX942-NEXT:    v_mov_b32_e32 v7, s7
+; GFX942-NEXT:    v_mov_b32_e32 v8, s4
+; GFX942-NEXT:    v_mov_b32_e32 v9, s5
+; GFX942-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-NEXT:    v_mov_b32_e32 v2, s7
+; GFX942-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_smfmac_i32_16x16x64_i8 v[8:11], v[12:13], v[4:7], v13
+; GFX942-NEXT:    v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v9
 ; GFX942-NEXT:    s_nop 6
-; GFX942-NEXT:    global_store_dword v0, v11, s[2:3] offset:12
+; GFX942-NEXT:    global_store_dword v10, v7, s[2:3] offset:12
 ; GFX942-NEXT:    s_endpgm
 entry:
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 0
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index c611c4b502817..681ceb9e87cbd 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -50,12 +50,12 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
+  ; CHECK-NEXT:   undef [[S_MOV_B:%[0-9]+]].sub0_sub1:sgpr_128 = S_MOV_B64_IMM_PSEUDO 0
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+  ; CHECK-NEXT:   [[S_MOV_B:%[0-9]+]].sub2_sub3:sgpr_128 = COPY [[S_MOV_B]].sub0_sub1
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   KILL undef %89:sgpr_128
-  ; CHECK-NEXT:   KILL undef %118:sgpr_128
   ; CHECK-NEXT:   [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
@@ -86,19 +86,19 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %301:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %356:sgpr_128, undef %357:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %367:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B]], undef %296:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B]], 16, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %351:sgpr_128, undef %352:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %362:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %351:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %362:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %346:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc
@@ -116,7 +116,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %383:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %378:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
@@ -154,9 +154,10 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
   ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4)
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]]
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
@@ -198,9 +199,9 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %464:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4)
   ; CHECK-NEXT:   KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
-  ; CHECK-NEXT:   KILL undef %469:sreg_64
+  ; CHECK-NEXT:   KILL undef %464:sreg_64
   ; CHECK-NEXT:   KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
   ; CHECK-NEXT:   [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
@@ -211,8 +212,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM23]]
   ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM24]]
+  ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM23]]
   ; CHECK-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]]
@@ -351,13 +352,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %542:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %537:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4)
   ; CHECK-NEXT:   [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec
   ; CHECK-NEXT:   [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
   ; CHECK-NEXT:   [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
   ; CHECK-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
   ; CHECK-NEXT:   undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
-  ; CHECK-NEXT:   IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vgpr_32, undef %558:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+  ; CHECK-NEXT:   IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %551:vgpr_32, undef %553:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
   ; CHECK-NEXT:   S_ENDPGM 0
 .expVert:
   %0 = extractelement <31 x i32> %userData, i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 474482b2d89ff..d3dcab18a25a2 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -34,11 +34,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[6:7]
 ; GLOBALNESS1-NEXT:    s_load_dwordx4 s[52:55], s[8:9], 0x0
 ; GLOBALNESS1-NEXT:    s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, 0
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dword v[44:45], v42, off
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v40, 0
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[42:43], 0, 0
+; GLOBALNESS1-NEXT:    global_store_dword v[42:43], v40, off
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[52:53]
+; GLOBALNESS1-NEXT:    global_load_dword v2, v40, s[52:53]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
 ; GLOBALNESS1-NEXT:    s_load_dword s7, s[8:9], 0x20
@@ -70,21 +70,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[8:9], 1, v1
-; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr58 : SGPR spill to VGPR lane
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s8, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v58, s8, 0
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s9, 1
+; GLOBALNESS1-NEXT:    v_writelane_b32 v58, s9, 1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[70:71], 1, v3
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v46, 0x80
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v44, 0x80
 ; GLOBALNESS1-NEXT:    s_mov_b32 s82, s16
 ; GLOBALNESS1-NEXT:    s_mov_b32 s83, s15
 ; GLOBALNESS1-NEXT:    s_mov_b32 s84, s14
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v47, 0
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, 0
 ; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
-; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr56_vgpr57
+; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr46_vgpr47
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -93,24 +93,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s4, 2
+; GLOBALNESS1-NEXT:    v_writelane_b32 v58, s4, 2
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s5, 3
+; GLOBALNESS1-NEXT:    v_writelane_b32 v58, s5, 3
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v3
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s4, 4
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s5, 5
+; GLOBALNESS1-NEXT:    v_writelane_b32 v58, s4, 4
+; GLOBALNESS1-NEXT:    v_writelane_b32 v58, s5, 5
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s4, 6
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s5, 7
+; GLOBALNESS1-NEXT:    v_writelane_b32 v58, s4, 6
+; GLOBALNESS1-NEXT:    v_writelane_b32 v58, s5, 7
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s70, 8
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s71, 9
+; GLOBALNESS1-NEXT:    v_writelane_b32 v58, s70, 8
+; GLOBALNESS1-NEXT:    v_writelane_b32 v58, s71, 9
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS1-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v59, 6
-; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v59, 7
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v58, 6
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v58, 7
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_28
 ; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow15
@@ -120,15 +120,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow28
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[46:47], v[0:1], v[0:1] op_sel:[0,1]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_29
 ; GLOBALNESS1-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS1-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS1-NEXT:    ; Child Loop BB1_16 Depth 2
-; GLOBALNESS1-NEXT:    flat_load_dword v40, v[46:47]
+; GLOBALNESS1-NEXT:    flat_load_dword v56, v[44:45]
 ; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
-; GLOBALNESS1-NEXT:    buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS1-NEXT:    flat_load_dword v58, v[46:47]
+; GLOBALNESS1-NEXT:    buffer_store_dword v40, off, s[0:3], 0
+; GLOBALNESS1-NEXT:    flat_load_dword v57, v[44:45]
 ; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS1-NEXT:    s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -176,7 +176,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_3
 ; GLOBALNESS1-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    flat_load_dword v0, v[44:45]
+; GLOBALNESS1-NEXT:    flat_load_dword v0, v[42:43]
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[86:87], 0, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
@@ -185,21 +185,20 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_25
 ; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s8, 10
-; GLOBALNESS1-NEXT:    v_writelane_b32 v59, s9, 11
-; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v59, 2
-; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v59, 3
+; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[42:43], off
+; GLOBALNESS1-NEXT:    v_writelane_b32 v58, s8, 10
+; GLOBALNESS1-NEXT:    v_writelane_b32 v58, s9, 11
+; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v58, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v58, 3
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[42:43], v[42:43], off
 ; GLOBALNESS1-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v58
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v57
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e32 v2, 0, v56, vcc
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -228,8 +227,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
 ; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb3.i.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v59, 0
-; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v59, 1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v58, 0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v58, 1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
 ; GLOBALNESS1-NEXT:  ; %bb.20: ; %bb6.i.i
@@ -265,25 +264,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b32 s13, s83
 ; GLOBALNESS1-NEXT:    s_mov_b32 s14, s82
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[56:57], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[42:43], v[46:47], off
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[54:55]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[96:97]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_14
 ; GLOBALNESS1-NEXT:  ; %bb.23: ; %bb62.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[42:43], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_14
 ; GLOBALNESS1-NEXT:  .LBB1_24: ; %Flow23
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_load_dwordx4 s[4:7], s[38:39], 0x0
-; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v59, 8
-; GLOBALNESS1-NEXT:    v_readlane_b32 s8, v59, 10
+; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v58, 8
+; GLOBALNESS1-NEXT:    v_readlane_b32 s8, v58, 10
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v59, 9
+; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v58, 9
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_mov_b32 s55, s7
-; GLOBALNESS1-NEXT:    v_readlane_b32 s9, v59, 11
+; GLOBALNESS1-NEXT:    v_readlane_b32 s9, v58, 11
 ; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow24
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[52:53]
@@ -291,19 +289,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS1-NEXT:  ; %bb.26: ; %bb67.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v59, 4
-; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v59, 5
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v58, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v58, 5
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS1-NEXT:  ; %bb.27: ; %bb69.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[42:43], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_1
 ; GLOBALNESS1-NEXT:  .LBB1_28: ; %bb73.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[42:43], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS1-NEXT:  .LBB1_29: ; %loop.exit.guard
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
@@ -348,11 +344,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[6:7]
 ; GLOBALNESS0-NEXT:    s_load_dwordx4 s[52:55], s[8:9], 0x0
 ; GLOBALNESS0-NEXT:    s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v42, 0
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dword v[44:45], v42, off
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v40, 0
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[42:43], 0, 0
+; GLOBALNESS0-NEXT:    global_store_dword v[42:43], v40, off
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[52:53]
+; GLOBALNESS0-NEXT:    global_load_dword v2, v40, s[52:53]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[4:5]
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
 ; GLOBALNESS0-NEXT:    s_load_dword s7, s[8:9], 0x20
@@ -384,21 +380,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[8:9], 1, v1
-; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr58 : SGPR spill to VGPR lane
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s8, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v58, s8, 0
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s9, 1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v58, s9, 1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[84:85], 1, v3
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v46, 0x80
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v44, 0x80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s70, s16
 ; GLOBALNESS0-NEXT:    s_mov_b32 s71, s15
 ; GLOBALNESS0-NEXT:    s_mov_b32 s82, s14
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v47, 0
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, 0
 ; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
-; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr56_vgpr57
+; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr46_vgpr47
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -407,24 +403,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s4, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v58, s4, 2
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s5, 3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v58, s5, 3
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v3
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s4, 4
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s5, 5
+; GLOBALNESS0-NEXT:    v_writelane_b32 v58, s4, 4
+; GLOBALNESS0-NEXT:    v_writelane_b32 v58, s5, 5
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, v2
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s4, 6
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s5, 7
+; GLOBALNESS0-NEXT:    v_writelane_b32 v58, s4, 6
+; GLOBALNESS0-NEXT:    v_writelane_b32 v58, s5, 7
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s84, 8
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s85, 9
+; GLOBALNESS0-NEXT:    v_writelane_b32 v58, s84, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v58, s85, 9
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS0-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v59, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v59, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v58, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v58, 7
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_28
 ; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow15
@@ -434,15 +430,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow28
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[46:47], v[0:1], v[0:1] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_29
 ; GLOBALNESS0-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS0-NEXT:    ; Child Loop BB1_16 Depth 2
-; GLOBALNESS0-NEXT:    flat_load_dword v40, v[46:47]
+; GLOBALNESS0-NEXT:    flat_load_dword v56, v[44:45]
 ; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
-; GLOBALNESS0-NEXT:    buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS0-NEXT:    flat_load_dword v58, v[46:47]
+; GLOBALNESS0-NEXT:    buffer_store_dword v40, off, s[0:3], 0
+; GLOBALNESS0-NEXT:    flat_load_dword v57, v[44:45]
 ; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS0-NEXT:    s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -490,7 +486,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_3
 ; GLOBALNESS0-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    flat_load_dword v0, v[44:45]
+; GLOBALNESS0-NEXT:    flat_load_dword v0, v[42:43]
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[86:87], 0, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
@@ -499,22 +495,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_25
 ; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s8, 10
-; GLOBALNESS0-NEXT:    v_writelane_b32 v59, s9, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v59, 2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v59, 3
+; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[42:43], off
+; GLOBALNESS0-NEXT:    v_writelane_b32 v58, s8, 10
+; GLOBALNESS0-NEXT:    v_writelane_b32 v58, s9, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v58, 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v58, 3
 ; GLOBALNESS0-NEXT:    s_mov_b32 s83, s55
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[42:43], v[42:43], off
 ; GLOBALNESS0-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v58
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v57
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v56, vcc
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -543,8 +538,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
 ; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb3.i.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v59, 0
-; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v59, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v58, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v58, 1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
 ; GLOBALNESS0-NEXT:  ; %bb.20: ; %bb6.i.i
@@ -580,23 +575,22 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[56:57], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[42:43], v[46:47], off
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[54:55]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[96:97]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_14
 ; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb62.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[42:43], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_14
 ; GLOBALNESS0-NEXT:  .LBB1_24: ; %Flow23
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v59, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s8, v59, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v58, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s8, v58, 10
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS0-NEXT:    s_mov_b32 s55, s83
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v59, 9
-; GLOBALNESS0-NEXT:    v_readlane_b32 s9, v59, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v58, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s9, v58, 11
 ; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow24
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[52:53]
@@ -604,19 +598,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS0-NEXT:  ; %bb.26: ; %bb67.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v59, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v59, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v58, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v58, 5
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb69.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[42:43], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_1
 ; GLOBALNESS0-NEXT:  .LBB1_28: ; %bb73.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[42:43], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS0-NEXT:  .LBB1_29: ; %loop.exit.guard
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index a401f989a2507..9867cd9495005 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -459,11 +459,10 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
 ; GFX942-NEXT:  ; %bb.1: ; %bb.1
 ; GFX942-NEXT:    global_load_dwordx2 v[2:3], v5, s[10:11]
 ; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v4
-; GFX942-NEXT:    s_waitcnt vmcnt(1)
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX942-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
 ; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX942-NEXT:  .LBB9_2: ; %Flow
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll b/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll
index a6dcbb5bbd695..cec401c0a9713 100644
--- a/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/vopc_dpp.ll
@@ -2,7 +2,7 @@
 
 define amdgpu_cs void @_amdgpu_cs_main(i32 %0) {
 ; GFX11-LABEL: _amdgpu_cs_main:
-; GFX11:    v_cmp_eq_u32_e64_dpp s1, v1, v0 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX11:    v_cmp_eq_u32_e64_dpp s0, v1, v0 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
 .entry:
   %1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 0, i32 0, i32 15, i32 15, i1 false)
   %2 = icmp ne i32 %1, %0
diff --git a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll
index 6133cb4690723..53172b7ce7ca7 100644
--- a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll
+++ b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll
@@ -30,7 +30,8 @@ define amdgpu_kernel void  @foo(i1 %cmp1) {
 ; GFX906-NEXT:    buffer_load_dword v6, off, s[12:15], 0 offset:12
 ; GFX906-NEXT:    s_load_dword s2, s[4:5], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1c
-; GFX906-NEXT:    s_mov_b32 s4, 0
+; GFX906-NEXT:    s_mov_b64 s[4:5], 0
+; GFX906-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    s_bitcmp1_b32 s2, 0
 ; GFX906-NEXT:    s_mul_i32 s0, s0, s1
@@ -38,9 +39,9 @@ define amdgpu_kernel void  @foo(i1 %cmp1) {
 ; GFX906-NEXT:    v_mad_u32_u24 v0, s0, v0, v1
 ; GFX906-NEXT:    v_add_lshl_u32 v2, v0, v2, 4
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    v_mov_b32_e32 v1, v0
-; GFX906-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX906-NEXT:    s_mov_b64 s[2:3], exec
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    s_mov_b32 s2, 0
+; GFX906-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX906-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX906-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; GFX906-NEXT:    s_waitcnt vmcnt(3)
@@ -59,13 +60,11 @@ define amdgpu_kernel void  @foo(i1 %cmp1) {
 ; GFX906-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX906-NEXT:  ; %bb.2:
-; GFX906-NEXT:    s_cmp_lg_u32 s5, 0
-; GFX906-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX906-NEXT:    s_cselect_b32 s5, 0x3ff00000, 0
-; GFX906-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX906-NEXT:    s_mov_b32 s5, s4
-; GFX906-NEXT:    s_mov_b32 s6, s4
-; GFX906-NEXT:    s_mov_b32 s7, s4
+; GFX906-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX906-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX906-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
+; GFX906-NEXT:    v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX906-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX906-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index deab407581880..360e987e2b43b 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3398,38 +3398,38 @@ define amdgpu_gs void @wqm_init_exec() {
 ; GFX9-W64-LABEL: wqm_init_exec:
 ; GFX9-W64:       ; %bb.0: ; %bb
 ; GFX9-W64-NEXT:    s_mov_b64 exec, -1
-; GFX9-W64-NEXT:    s_mov_b32 s0, 0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-W64-NEXT:    s_mov_b32 s1, s0
-; GFX9-W64-NEXT:    s_mov_b32 s2, s0
-; GFX9-W64-NEXT:    s_mov_b32 s3, s0
-; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-W64-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-W64-NEXT:    s_mov_b32 s4, 0
 ; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
-; GFX9-W64-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $exec
-; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-W64-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $exec
+; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-W64-NEXT:    ds_write_b32 v0, v1
 ; GFX9-W64-NEXT:    s_endpgm
 ;
 ; GFX10-W32-LABEL: wqm_init_exec:
 ; GFX10-W32:       ; %bb.0: ; %bb
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, -1
-; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s0, 0
+; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-W32-NEXT:    s_mov_b32 s2, 0
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
-; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
+; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v0
-; GFX10-W32-NEXT:    v_mov_b32_e32 v4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
+; GFX10-W32-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-W32-NEXT:    v_mov_b32_e32 v5, s2
+; GFX10-W32-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-W32-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GFX10-W32-NEXT:    ds_write_b32 v0, v4
+; GFX10-W32-NEXT:    ds_write_b32 v4, v5
 ; GFX10-W32-NEXT:    s_endpgm
 bb:
   call void @llvm.amdgcn.init.exec(i64 -1)



More information about the llvm-commits mailing list