[llvm-branch-commits] [llvm] [AMDGPU][SILoadStoreOptimizer] Merge constrained sloads (PR #96162)

Christudasan Devadasan via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Jul 3 06:32:40 PDT 2024


https://github.com/cdevadas updated https://github.com/llvm/llvm-project/pull/96162

>From 7627b45295a9c64d4caccdb3a18eaf941a3937cd Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Thu, 20 Jun 2024 10:02:29 +0000
Subject: [PATCH 1/6] [AMDGPU][SILoadStoreOptimizer] Merge constrained sloads

Consider constrained multi-dword loads as well when merging
individual loads into a single multi-dword load.
---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |   1 +
 .../Target/AMDGPU/SILoadStoreOptimizer.cpp    |  79 ++-
 .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll        | 169 +++--
 .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll    |  12 +-
 .../llvm.amdgcn.global.atomic.csub.ll         |  12 +-
 .../GlobalISel/llvm.amdgcn.intersect_ray.ll   | 102 +--
 .../GlobalISel/llvm.amdgcn.set.inactive.ll    | 100 +--
 .../GlobalISel/llvm.amdgcn.update.dpp.ll      |  84 +--
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 204 +++---
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 130 ++--
 llvm/test/CodeGen/AMDGPU/add.v2i16.ll         |  42 +-
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      | 408 +++++------
 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll  |  48 +-
 llvm/test/CodeGen/AMDGPU/bfe-patterns.ll      |  40 +-
 llvm/test/CodeGen/AMDGPU/bfm.ll               |  10 +-
 llvm/test/CodeGen/AMDGPU/bitreverse.ll        | 314 ++++-----
 llvm/test/CodeGen/AMDGPU/build_vector.ll      |  50 +-
 .../CodeGen/AMDGPU/calling-conventions.ll     |  50 +-
 llvm/test/CodeGen/AMDGPU/cluster_stores.ll    | 120 ++--
 .../CodeGen/AMDGPU/combine-cond-add-sub.ll    |  20 +-
 llvm/test/CodeGen/AMDGPU/ctlz.ll              | 454 ++++++------
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   | 446 ++++++------
 llvm/test/CodeGen/AMDGPU/ctpop16.ll           | 100 +--
 llvm/test/CodeGen/AMDGPU/ctpop64.ll           |  48 +-
 llvm/test/CodeGen/AMDGPU/cttz.ll              | 370 +++++-----
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   | 424 +++++------
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll     | 665 +++++++++---------
 .../AMDGPU/divergence-driven-buildvector.ll   |  48 +-
 llvm/test/CodeGen/AMDGPU/ds_read2.ll          |  16 +-
 .../CodeGen/AMDGPU/extract_vector_elt-f16.ll  |  50 +-
 llvm/test/CodeGen/AMDGPU/fabs.f16.ll          |  12 +-
 llvm/test/CodeGen/AMDGPU/fabs.ll              |  34 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     |   6 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |  63 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll     |  66 +-
 llvm/test/CodeGen/AMDGPU/fdiv.ll              | 286 ++++----
 llvm/test/CodeGen/AMDGPU/flat_atomics.ll      |  40 +-
 .../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 112 +--
 llvm/test/CodeGen/AMDGPU/fma-combine.ll       | 176 ++---
 .../AMDGPU/fmul-2-combine-multi-use.ll        |  18 +-
 llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll       | 480 ++++++-------
 llvm/test/CodeGen/AMDGPU/fnearbyint.ll        |  18 +-
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |  46 +-
 llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll     |  12 +-
 llvm/test/CodeGen/AMDGPU/fneg-fabs.ll         |  34 +-
 llvm/test/CodeGen/AMDGPU/fneg.ll              |  26 +-
 .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll |  52 +-
 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll |  44 +-
 llvm/test/CodeGen/AMDGPU/fp-classify.ll       |  26 +-
 .../AMDGPU/fp-min-max-buffer-atomics.ll       |  66 +-
 .../AMDGPU/fp-min-max-buffer-ptr-atomics.ll   |  92 +--
 llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll      |  40 +-
 llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll      |  40 +-
 llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll      |  40 +-
 .../AMDGPU/fp64-min-max-buffer-atomics.ll     |  48 +-
 .../AMDGPU/fp64-min-max-buffer-ptr-atomics.ll |  76 +-
 llvm/test/CodeGen/AMDGPU/fp_to_sint.ll        |  38 +-
 llvm/test/CodeGen/AMDGPU/fp_to_uint.ll        |  38 +-
 llvm/test/CodeGen/AMDGPU/fshl.ll              |  88 +--
 llvm/test/CodeGen/AMDGPU/fshr.ll              |  40 +-
 llvm/test/CodeGen/AMDGPU/global_atomics.ll    |  48 +-
 .../AMDGPU/global_atomics_i32_system.ll       | 196 +++---
 llvm/test/CodeGen/AMDGPU/half.ll              |  82 +--
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    | 176 ++---
 .../insert_waitcnt_for_precise_memory.ll      | 248 +++----
 llvm/test/CodeGen/AMDGPU/kernel-args.ll       | 106 +--
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll   |  30 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll    |  64 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll    | 100 +--
 .../AMDGPU/llvm.amdgcn.global.atomic.csub.ll  |   8 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll    |  32 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll    |  50 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll    |  16 +-
 .../AMDGPU/llvm.amdgcn.intersect_ray.ll       |  72 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll    | 392 +++++------
 .../AMDGPU/llvm.amdgcn.permlane16.var.ll      | 168 ++---
 .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll   |  36 +-
 .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll      | 132 ++--
 .../llvm.amdgcn.sched.group.barrier.gfx11.ll  |  28 +-
 .../llvm.amdgcn.sched.group.barrier.gfx12.ll  |  28 +-
 .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 244 +++----
 .../AMDGPU/llvm.amdgcn.set.inactive.ll        |  80 +--
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll  |  82 +--
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          | 116 +--
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        | 116 +--
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |  26 +-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |  36 +-
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |  36 +-
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |  64 +-
 .../AMDGPU/llvm.r600.read.local.size.ll       |   8 +-
 llvm/test/CodeGen/AMDGPU/llvm.round.ll        |  60 +-
 llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll        |  42 +-
 llvm/test/CodeGen/AMDGPU/madak.ll             | 184 ++---
 llvm/test/CodeGen/AMDGPU/memory_clause.ll     |  36 +-
 llvm/test/CodeGen/AMDGPU/merge-s-load.mir     | 180 ++++-
 llvm/test/CodeGen/AMDGPU/min.ll               |  54 +-
 .../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll  | 160 ++---
 llvm/test/CodeGen/AMDGPU/mul.ll               |  29 +-
 llvm/test/CodeGen/AMDGPU/mul_int24.ll         |  76 +-
 llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 110 +--
 llvm/test/CodeGen/AMDGPU/or.ll                |  16 +-
 llvm/test/CodeGen/AMDGPU/packed-op-sel.ll     |  72 +-
 llvm/test/CodeGen/AMDGPU/preload-kernargs.ll  |  48 +-
 .../AMDGPU/ptr-buffer-alias-scheduling.ll     |  48 +-
 llvm/test/CodeGen/AMDGPU/rotl.ll              |  26 +-
 llvm/test/CodeGen/AMDGPU/rotr.ll              |  22 +-
 llvm/test/CodeGen/AMDGPU/shl.v2i16.ll         |  72 +-
 llvm/test/CodeGen/AMDGPU/sign_extend.ll       |  54 +-
 llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll |   8 +-
 llvm/test/CodeGen/AMDGPU/sub.ll               |  34 +-
 llvm/test/CodeGen/AMDGPU/sub.v2i16.ll         |  72 +-
 llvm/test/CodeGen/AMDGPU/udiv.ll              |  34 +-
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll         |  30 +-
 llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll   | 112 +--
 llvm/test/CodeGen/AMDGPU/wave32.ll            |  54 +-
 llvm/test/CodeGen/AMDGPU/xor.ll               |  10 +-
 116 files changed, 5687 insertions(+), 5549 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index db5b467f22389..19d5b950d7142 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -967,6 +967,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
   bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
+  bool hasXnackReplay() const { return GFX8Insts; }
 
   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 8b42d4a1dee7a..0b285d52b539e 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -216,7 +216,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
                                    CombineInfo &Paired, bool Modify = false);
   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                         const CombineInfo &Paired);
-  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
+  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired,
+                               const GCNSubtarget *STI = nullptr);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                      const CombineInfo &Paired);
   const TargetRegisterClass *
@@ -343,6 +344,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORD_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORD:
@@ -353,6 +355,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -363,6 +366,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX3_IMM:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -373,6 +377,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -383,6 +388,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return 8;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -507,6 +513,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return S_LOAD_IMM;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -591,6 +602,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return AMDGPU::S_LOAD_DWORD_IMM;
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -703,6 +719,11 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     Result.SBase = true;
     return Result;
   case AMDGPU::DS_READ_B32:
@@ -1212,8 +1233,17 @@ void SILoadStoreOptimizer::copyToDestRegs(
 
   // Copy to the old destination registers.
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
-  const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
-  const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
+  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+
+  // The constrained sload instructions in the S_LOAD_IMM class have the
+  // `early-clobber` flag set on their dst operand. Clear it before reusing the
+  // operands as copy destinations.
+  if (Dest0->isEarlyClobber())
+    Dest0->setIsEarlyClobber(false);
+
+  if (Dest1->isEarlyClobber())
+    Dest1->setIsEarlyClobber(false);
 
   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
       .add(*Dest0) // Copy to same destination including flags and sub reg.
@@ -1446,7 +1476,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  const unsigned Opcode = getNewOpcode(CI, Paired);
+  const unsigned Opcode = getNewOpcode(CI, Paired, STM);
 
   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
 
@@ -1658,7 +1688,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
 }
 
 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
-                                            const CombineInfo &Paired) {
+                                            const CombineInfo &Paired,
+                                            const GCNSubtarget *STI) {
   const unsigned Width = CI.Width + Paired.Width;
 
   switch (getCommonInstClass(CI, Paired)) {
@@ -1701,17 +1732,33 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
     }
   case S_LOAD_IMM:
-    switch (Width) {
-    default:
-      return 0;
-    case 2:
-      return AMDGPU::S_LOAD_DWORDX2_IMM;
-    case 3:
-      return AMDGPU::S_LOAD_DWORDX3_IMM;
-    case 4:
-      return AMDGPU::S_LOAD_DWORDX4_IMM;
-    case 8:
-      return AMDGPU::S_LOAD_DWORDX8_IMM;
+    // For targets that support XNACK replay, use the constrained load opcode.
+    if (STI && STI->hasXnackReplay()) {
+      switch (Width) {
+      default:
+        return 0;
+      case 2:
+        return AMDGPU::S_LOAD_DWORDX2_IMM_ec;
+      case 3:
+        return AMDGPU::S_LOAD_DWORDX3_IMM_ec;
+      case 4:
+        return AMDGPU::S_LOAD_DWORDX4_IMM_ec;
+      case 8:
+        return AMDGPU::S_LOAD_DWORDX8_IMM_ec;
+      }
+    } else {
+      switch (Width) {
+      default:
+        return 0;
+      case 2:
+        return AMDGPU::S_LOAD_DWORDX2_IMM;
+      case 3:
+        return AMDGPU::S_LOAD_DWORDX3_IMM;
+      case 4:
+        return AMDGPU::S_LOAD_DWORDX4_IMM;
+      case 8:
+        return AMDGPU::S_LOAD_DWORDX8_IMM;
+      }
     }
   case GLOBAL_LOAD:
     switch (Width) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index eb20178f9f4d8..3f034eaca4997 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -468,18 +468,18 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
 ;
 ; VI-LABEL: load_i8_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -513,16 +513,16 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v2i8_to_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v1, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -562,16 +562,16 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v3i8_to_v3f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s4
+; VI-NEXT:    v_mov_b32_e32 v4, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -612,16 +612,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v4i8_to_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -679,11 +679,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ;
 ; VI-LABEL: load_v4i8_to_v4f32_unaligned:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
@@ -706,12 +706,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v3
 ; VI-NEXT:    v_or_b32_e32 v3, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -770,6 +770,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v6, 9
 ; VI-NEXT:    v_mov_b32_e32 v7, 8
@@ -779,11 +780,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v1, v[0:1]
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v2, 0xff
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
 ; VI-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -804,8 +803,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 24, v6
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_or_b32_e32 v2, v0, v2
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -858,11 +857,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v7i8_to_v7f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
@@ -884,10 +883,10 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; VI-NEXT:    flat_load_ubyte v4, v[8:9]
 ; VI-NEXT:    flat_load_ubyte v5, v[10:11]
 ; VI-NEXT:    flat_load_ubyte v6, v[12:13]
-; VI-NEXT:    v_mov_b32_e32 v8, s1
-; VI-NEXT:    v_mov_b32_e32 v7, s0
-; VI-NEXT:    s_add_u32 s0, s0, 16
-; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    s_add_u32 s0, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v8, s5
+; VI-NEXT:    s_addc_u32 s1, s5, 0
+; VI-NEXT:    v_mov_b32_e32 v7, s4
 ; VI-NEXT:    v_mov_b32_e32 v10, s1
 ; VI-NEXT:    v_mov_b32_e32 v9, s0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
@@ -949,18 +948,18 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v8i8_to_v8f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v9, s1
-; VI-NEXT:    v_mov_b32_e32 v8, s0
-; VI-NEXT:    s_add_u32 s0, s0, 16
-; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    s_add_u32 s0, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v9, s5
+; VI-NEXT:    s_addc_u32 s1, s5, 0
+; VI-NEXT:    v_mov_b32_e32 v8, s4
 ; VI-NEXT:    v_mov_b32_e32 v11, s1
 ; VI-NEXT:    v_mov_b32_e32 v10, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1005,19 +1004,19 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: i8_zext_inreg_i32_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1051,18 +1050,18 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: i8_zext_inreg_hi1_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1096,18 +1095,18 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: i8_zext_i32_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1157,11 +1156,11 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
@@ -1184,12 +1183,12 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v3
 ; VI-NEXT:    v_or_b32_e32 v3, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1221,18 +1220,18 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte0_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1265,18 +1264,18 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte1_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1310,18 +1309,18 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte2_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1354,18 +1353,18 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte3_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index fff341b07881b..a018ea5bf18f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -159,10 +159,10 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
 define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX940-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX940-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
@@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
 define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s1
-; GFX940-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NEXT:    v_mov_b32_e32 v0, s3
+; GFX940-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    ds_pk_add_bf16 v1, v0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index ade6e55b482bb..dbe7d473ea632 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -160,10 +160,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1)
 ;
 ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[4:5] offset:4096 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
@@ -199,10 +199,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa
 ;
 ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[4:5] offset:4096 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 6e96a4ddbc0b3..87d0d712d5bde 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX1030-NEXT:    v_mov_b32_e32 v6, 4.0
@@ -637,10 +637,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40e00000
 ; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x41000000
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1030-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1030-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -651,24 +651,24 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1013-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x40a00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x40c00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40e00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x41000000
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1013-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1013-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1013-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1013-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1013-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1013-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1013-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX1013-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v6
 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX1013-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
@@ -681,42 +681,42 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; GFX11-NEXT:    s_mov_b32 s8, 0x40400000
+; GFX11-NEXT:    s_mov_b32 s2, 2.0
+; GFX11-NEXT:    s_mov_b32 s1, 1.0
+; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    s_mov_b32 s12, 0x40c00000
-; GFX11-NEXT:    s_mov_b32 s10, 0x40a00000
-; GFX11-NEXT:    s_mov_b32 s9, 4.0
 ; GFX11-NEXT:    s_mov_b32 s14, 0x41000000
 ; GFX11-NEXT:    s_mov_b32 s13, 0x40e00000
 ; GFX11-NEXT:    v_mov_b32_e32 v6, s12
 ; GFX11-NEXT:    v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    s_mov_b32 s1, 1.0
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    s_mov_b32 s4, 0x40400000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX11-NEXT:    flat_load_b32 v9, v[0:1]
 ; GFX11-NEXT:    flat_load_b32 v10, v[2:3]
-; GFX11-NEXT:    s_mov_b32 s2, 2.0
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_mov_b32 s6, 0x40a00000
+; GFX11-NEXT:    s_mov_b32 s5, 4.0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s4
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9
+; GFX11-NEXT:    v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v4, s5
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7]
+; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[8:11]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
 ; GFX11-NEXT:    s_endpgm
@@ -742,16 +742,16 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x44004200
 ; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x46004500
 ; GFX1030-NEXT:    v_mov_b32_e32 v7, 0x48004700
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1030-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1030-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -762,21 +762,21 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1013-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x48004700
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1013-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1013-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1013-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1013-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1013-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1013-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1013-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX1013-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v6
 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX1013-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
@@ -789,37 +789,37 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x44004200
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; GFX11-NEXT:    s_mov_b32 s8, 0x42004600
-; GFX11-NEXT:    s_mov_b32 s9, 0x44004700
-; GFX11-NEXT:    s_mov_b32 s10, 0x45004800
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_mov_b32 s2, 2.0
 ; GFX11-NEXT:    s_mov_b32 s1, 1.0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    s_mov_b32 s4, 0x42004600
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT:    s_mov_b32 s5, 0x44004700
 ; GFX11-NEXT:    flat_load_b32 v6, v[0:1]
 ; GFX11-NEXT:    flat_load_b32 v7, v[2:3]
-; GFX11-NEXT:    s_mov_b32 s2, 2.0
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_mov_b32 s6, 0x45004800
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s4
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9
+; GFX11-NEXT:    v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v4, s5
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16
+; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[8:11] a16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 0c60be9d94591..fa24489df52dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -180,21 +180,21 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half>
 define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
 ; GCN-LABEL: set_inactive_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 1
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s0, 1
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
   store <2 x i32> %tmp, ptr addrspace(1) %out
@@ -204,21 +204,21 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
 define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GCN-LABEL: set_inactive_v2f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 1.0
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s0, 1.0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
   store <2 x float> %tmp, ptr addrspace(1) %out
@@ -248,21 +248,21 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa
 define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
 ; GCN-LABEL: set_inactive_v4i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0x10001
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s0, 0x10001
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
   store <4 x i16> %tmp, ptr addrspace(1) %out
@@ -272,21 +272,21 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
 define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; GCN-LABEL: set_inactive_v4f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s0, 0x3c003c00
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
   store <4 x half> %tmp, ptr addrspace(1) %out
@@ -296,21 +296,21 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
 define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
 ; GCN-LABEL: set_inactive_v4bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0x3f803f80
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s0, 0x3f803f80
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
   store <4 x bfloat> %tmp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 727184a36c006..d6282708ece35 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -6,36 +6,36 @@
 define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
 ; GFX8-LABEL: dpp_test:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX8-NEXT:    s_nop 1
 ; GFX8-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: dpp_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: dpp_test:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -156,47 +156,47 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
 define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
 ; GFX8-LABEL: update_dppv2i32_test:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: update_dppv2i32_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppv2i32_test:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[4:5]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -211,47 +211,47 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
 define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
 ; GFX8-LABEL: update_dppv2f32_test:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: update_dppv2f32_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppv2f32_test:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[4:5]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index b666f45521661..c3bd56610d102 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -616,26 +616,26 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: sdivrem_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_ashr_i32 s2, s10, 31
-; GFX8-NEXT:    s_add_i32 s0, s10, s2
+; GFX8-NEXT:    s_ashr_i32 s2, s14, 31
+; GFX8-NEXT:    s_add_i32 s0, s14, s2
 ; GFX8-NEXT:    s_xor_b32 s3, s0, s2
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX8-NEXT:    s_ashr_i32 s10, s11, 31
-; GFX8-NEXT:    s_add_i32 s0, s11, s10
-; GFX8-NEXT:    s_xor_b32 s11, s0, s10
+; GFX8-NEXT:    s_ashr_i32 s4, s15, 31
+; GFX8-NEXT:    s_add_i32 s0, s15, s4
+; GFX8-NEXT:    s_xor_b32 s5, s0, s4
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_sub_i32 s0, 0, s3
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX8-NEXT:    s_ashr_i32 s12, s8, 31
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s5
+; GFX8-NEXT:    s_ashr_i32 s6, s12, 31
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX8-NEXT:    s_sub_i32 s1, 0, s11
+; GFX8-NEXT:    s_sub_i32 s1, 0, s5
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX8-NEXT:    s_add_i32 s0, s8, s12
-; GFX8-NEXT:    s_xor_b32 s0, s0, s12
+; GFX8-NEXT:    s_add_i32 s0, s12, s6
+; GFX8-NEXT:    s_xor_b32 s0, s0, s6
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
@@ -654,159 +654,159 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
-; GFX8-NEXT:    s_xor_b32 s0, s12, s2
-; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
-; GFX8-NEXT:    s_add_i32 s1, s9, s2
+; GFX8-NEXT:    s_xor_b32 s0, s6, s2
+; GFX8-NEXT:    s_ashr_i32 s2, s13, 31
+; GFX8-NEXT:    s_add_i32 s1, s13, s2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX8-NEXT:    v_xor_b32_e32 v2, s12, v3
+; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v3
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s11
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s12, v2
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s5
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s5, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
-; GFX8-NEXT:    s_xor_b32 s0, s2, s10
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s5, v3
+; GFX8-NEXT:    s_xor_b32 s0, s2, s4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdivrem_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s8, s6, 31
-; GFX9-NEXT:    s_add_i32 s6, s6, s8
-; GFX9-NEXT:    s_xor_b32 s6, s6, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT:    s_ashr_i32 s9, s7, 31
-; GFX9-NEXT:    s_add_i32 s7, s7, s9
-; GFX9-NEXT:    s_xor_b32 s7, s7, s9
+; GFX9-NEXT:    s_ashr_i32 s0, s14, 31
+; GFX9-NEXT:    s_add_i32 s1, s14, s0
+; GFX9-NEXT:    s_xor_b32 s1, s1, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX9-NEXT:    s_ashr_i32 s2, s15, 31
+; GFX9-NEXT:    s_add_i32 s3, s15, s2
+; GFX9-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT:    s_sub_i32 s12, 0, s6
-; GFX9-NEXT:    s_ashr_i32 s10, s4, 31
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX9-NEXT:    s_sub_i32 s6, 0, s1
+; GFX9-NEXT:    s_ashr_i32 s4, s12, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_add_i32 s4, s4, s10
-; GFX9-NEXT:    s_xor_b32 s4, s4, s10
-; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
+; GFX9-NEXT:    s_sub_i32 s7, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s5, s13, 31
+; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s12, 0, s7
+; GFX9-NEXT:    s_add_i32 s6, s12, s4
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_ashr_i32 s11, s5, 31
-; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX9-NEXT:    s_add_i32 s5, s5, s11
+; GFX9-NEXT:    s_xor_b32 s6, s6, s4
+; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
+; GFX9-NEXT:    s_add_i32 s7, s13, s5
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
-; GFX9-NEXT:    s_xor_b32 s5, s5, s11
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
+; GFX9-NEXT:    s_xor_b32 s7, s7, s5
+; GFX9-NEXT:    s_xor_b32 s0, s4, s0
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s1
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    v_mul_hi_u32 v1, s7, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s6, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s6, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s7
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s3
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    s_xor_b32 s4, s10, s8
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
+; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
-; GFX9-NEXT:    s_xor_b32 s4, s11, s9
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
+; GFX9-NEXT:    s_xor_b32 s0, s5, s2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
-; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
+; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v1, s0, v1
+; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v3
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s1, s10, 31
-; GFX10-NEXT:    s_ashr_i32 s2, s11, 31
-; GFX10-NEXT:    s_add_i32 s0, s10, s1
-; GFX10-NEXT:    s_add_i32 s3, s11, s2
-; GFX10-NEXT:    s_xor_b32 s10, s0, s1
+; GFX10-NEXT:    s_ashr_i32 s1, s14, 31
+; GFX10-NEXT:    s_ashr_i32 s2, s15, 31
+; GFX10-NEXT:    s_add_i32 s0, s14, s1
+; GFX10-NEXT:    s_add_i32 s3, s15, s2
+; GFX10-NEXT:    s_xor_b32 s4, s0, s1
 ; GFX10-NEXT:    s_xor_b32 s3, s3, s2
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX10-NEXT:    s_sub_i32 s0, 0, s10
-; GFX10-NEXT:    s_sub_i32 s11, 0, s3
-; GFX10-NEXT:    s_ashr_i32 s12, s9, 31
+; GFX10-NEXT:    s_sub_i32 s0, 0, s4
+; GFX10-NEXT:    s_sub_i32 s5, 0, s3
+; GFX10-NEXT:    s_ashr_i32 s6, s13, 31
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX10-NEXT:    s_add_i32 s7, s13, s6
+; GFX10-NEXT:    s_xor_b32 s7, s7, s6
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s11, v1
-; GFX10-NEXT:    s_ashr_i32 s11, s8, 31
-; GFX10-NEXT:    s_add_i32 s0, s8, s11
-; GFX10-NEXT:    s_add_i32 s8, s9, s12
-; GFX10-NEXT:    s_xor_b32 s0, s0, s11
-; GFX10-NEXT:    s_xor_b32 s8, s8, s12
+; GFX10-NEXT:    v_mul_lo_u32 v3, s5, v1
+; GFX10-NEXT:    s_ashr_i32 s5, s12, 31
+; GFX10-NEXT:    s_add_i32 s0, s12, s5
+; GFX10-NEXT:    s_xor_b32 s1, s5, s1
+; GFX10-NEXT:    s_xor_b32 s0, s0, s5
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX10-NEXT:    s_xor_b32 s1, s11, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s8, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s10
+; GFX10-NEXT:    v_mul_hi_u32 v1, s7, v1
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s4
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s0, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s8, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s7, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s4, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s4, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
@@ -814,26 +814,26 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s4, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s4, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s0, s12, s2
+; GFX10-NEXT:    s_xor_b32 s0, s6, s2
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX10-NEXT:    v_xor_b32_e32 v2, s11, v2
-; GFX10-NEXT:    v_xor_b32_e32 v3, s12, v3
+; GFX10-NEXT:    v_xor_b32_e32 v2, s5, v2
+; GFX10-NEXT:    v_xor_b32_e32 v3, s6, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s11, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s12, v3
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s5, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s6, v3
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
 ; GFX10-NEXT:    s_endpgm
   %div = sdiv <2 x i32> %x, %y
   store <2 x i32> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index a58397eccaba7..63a0d8afd4de0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -522,11 +522,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: udivrem_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX8-NEXT:    s_sub_i32 s0, 0, s10
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX8-NEXT:    s_sub_i32 s0, 0, s14
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -534,54 +534,54 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX8-NEXT:    s_sub_i32 s0, 0, s11
+; GFX8-NEXT:    s_sub_i32 s0, 0, s15
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s12, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s10
+; GFX8-NEXT:    v_mul_hi_u32 v1, s13, v1
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s14
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
-; GFX8-NEXT:    v_mul_lo_u32 v4, v1, s11
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s8, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX8-NEXT:    v_mul_lo_u32 v4, v1, s15
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s12, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s10, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s14, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s10, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s14, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s9, v4
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s13, v4
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s15, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s15, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    v_mov_b32_e32 v4, s8
+; GFX8-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: udivrem_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX9-NEXT:    s_sub_i32 s0, 0, s10
-; GFX9-NEXT:    s_sub_i32 s1, 0, s11
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX9-NEXT:    s_sub_i32 s0, 0, s14
+; GFX9-NEXT:    s_sub_i32 s1, 0, s15
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -593,47 +593,47 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s12, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s10
+; GFX9-NEXT:    v_mul_hi_u32 v1, s13, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s14
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s11
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s15
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s8, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v3
+; GFX9-NEXT:    v_sub_u32_e32 v2, s12, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, s13, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s10, v2
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s14, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s15, v3
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s10, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s14, v2
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s11, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s15, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX10-NEXT:    s_sub_i32 s0, 0, s10
-; GFX10-NEXT:    s_sub_i32 s1, 0, s11
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX10-NEXT:    s_sub_i32 s0, 0, s14
+; GFX10-NEXT:    s_sub_i32 s1, 0, s15
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -646,34 +646,34 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s10
-; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s11
+; GFX10-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX10-NEXT:    v_mul_hi_u32 v1, s13, v1
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s14
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s15
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s8, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s9, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s11, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s12, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s13, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s14, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s15, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s11, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s14, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s15, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[4:5]
-; GFX10-NEXT:    global_store_dwordx2 v8, v[2:3], s[6:7]
+; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[8:9]
+; GFX10-NEXT:    global_store_dwordx2 v8, v[2:3], s[10:11]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv <2 x i32> %x, %y
   store <2 x i32> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 6f67ce4de9ce5..be9b5b00c39d7 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -226,47 +226,47 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr
 define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
 ; VI-LABEL: s_test_add_v2i16_kernarg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    s_add_i32 s2, s2, s3
-; VI-NEXT:    s_add_i32 s4, s4, s5
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_lshl_b32 s3, s4, 16
-; VI-NEXT:    s_or_b32 s2, s2, s3
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_lshr_b32 s0, s6, 16
+; VI-NEXT:    s_lshr_b32 s1, s7, 16
+; VI-NEXT:    s_add_i32 s2, s6, s7
+; VI-NEXT:    s_add_i32 s0, s0, s1
+; VI-NEXT:    s_and_b32 s1, s2, 0xffff
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_test_add_v2i16_kernarg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_pk_add_u16 v1, s2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_pk_add_u16 v1, s6, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_add_v2i16_kernarg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_add_u16 v1, s2, s3
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_pk_add_u16 v1, s6, s7
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_add_v2i16_kernarg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_add_u16 v1, s2, s3
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_pk_add_u16 v1, s6, s7
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 8144fb7a3b646..559871d162e13 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -72,31 +72,31 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX9-NEXT:    s_mul_i32 s5, s4, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_add_i32 s6, s4, 1
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX9-NEXT:    s_mul_i32 s1, s0, s7
+; GFX9-NEXT:    s_sub_i32 s1, s6, s1
+; GFX9-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-NEXT:    s_sub_i32 s3, s1, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX9-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -167,29 +167,29 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: urem_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX9-NEXT:    s_mul_i32 s4, s4, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
-; GFX9-NEXT:    s_sub_i32 s4, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX9-NEXT:    s_sub_i32 s4, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX9-NEXT:    s_mul_i32 s0, s0, s7
+; GFX9-NEXT:    s_sub_i32 s0, s6, s0
+; GFX9-NEXT:    s_sub_i32 s1, s0, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX9-NEXT:    s_sub_i32 s1, s0, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -280,37 +280,37 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: sdiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_abs_i32 s4, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT:    s_sub_i32 s5, 0, s4
-; GFX9-NEXT:    s_xor_b32 s3, s2, s3
-; GFX9-NEXT:    s_abs_i32 s2, s2
+; GFX9-NEXT:    s_abs_i32 s0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_xor_b32 s1, s6, s7
+; GFX9-NEXT:    s_abs_i32 s2, s6
+; GFX9-NEXT:    s_sub_i32 s3, 0, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s1, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT:    s_mul_i32 s6, s5, s4
+; GFX9-NEXT:    s_mul_i32 s3, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GFX9-NEXT:    s_add_i32 s6, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
+; GFX9-NEXT:    s_mul_i32 s6, s3, s0
 ; GFX9-NEXT:    s_sub_i32 s2, s2, s6
-; GFX9-NEXT:    s_add_i32 s7, s5, 1
-; GFX9-NEXT:    s_sub_i32 s6, s2, s4
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
-; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX9-NEXT:    s_add_i32 s7, s3, 1
+; GFX9-NEXT:    s_sub_i32 s6, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s3, s7, s3
 ; GFX9-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX9-NEXT:    s_add_i32 s6, s5, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
-; GFX9-NEXT:    s_cselect_b32 s2, s6, s5
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_add_i32 s6, s3, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s6, s3
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -394,34 +394,34 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: srem_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_abs_i32 s3, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s5, 0, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_abs_i32 s2, s2
+; GFX9-NEXT:    s_abs_i32 s0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_ashr_i32 s1, s6, 31
+; GFX9-NEXT:    s_abs_i32 s2, s6
+; GFX9-NEXT:    s_sub_i32 s3, 0, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT:    s_mul_i32 s5, s5, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_xor_b32 s2, s2, s4
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_mul_i32 s3, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GFX9-NEXT:    s_add_i32 s6, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
+; GFX9-NEXT:    s_mul_i32 s3, s3, s0
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s3, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX9-NEXT:    s_sub_i32 s3, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s3, s2
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -5482,13 +5482,13 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: udiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s3, s3, 12
-; GFX9-NEXT:    s_lshr_b32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_add_i32 s0, s7, 12
+; GFX9-NEXT:    s_lshr_b32 s0, s6, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = udiv i32 %x, %shl.y
@@ -5524,14 +5524,14 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: udiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_lshr_b32 s0, s6, 12
+; GFX9-NEXT:    s_lshr_b32 s1, s7, 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -5570,18 +5570,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <
 ;
 ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_hi_u32 s4, s3, 0x100101
-; GFX9-NEXT:    s_sub_i32 s3, s3, s4
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 11
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_mul_hi_u32 s1, s7, 0x100101
+; GFX9-NEXT:    s_sub_i32 s2, s7, s1
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX9-NEXT:    s_add_i32 s2, s2, s1
+; GFX9-NEXT:    s_lshr_b32 s0, s6, 12
+; GFX9-NEXT:    s_lshr_b32 s1, s2, 11
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -5875,14 +5875,14 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: urem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT:    s_add_i32 s3, s3, -1
-; GFX9-NEXT:    s_and_b32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
+; GFX9-NEXT:    s_add_i32 s0, s0, -1
+; GFX9-NEXT:    s_and_b32 s0, s6, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = urem i32 %x, %shl.y
@@ -5918,14 +5918,14 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: urem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
-; GFX9-NEXT:    s_and_b32 s3, s3, 0xfff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_and_b32 s0, s6, 0xfff
+; GFX9-NEXT:    s_and_b32 s1, s7, 0xfff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6234,41 +6234,41 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s6, 0, s3
-; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s5
+; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_ashr_i32 s2, s6, 31
+; GFX9-NEXT:    s_add_i32 s3, s6, s2
+; GFX9-NEXT:    s_sub_i32 s6, 0, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s2, s2, s5
+; GFX9-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
 ; GFX9-NEXT:    s_mul_i32 s6, s6, s7
 ; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
-; GFX9-NEXT:    s_mul_hi_u32 s6, s2, s7
-; GFX9-NEXT:    s_mul_i32 s8, s6, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s8
+; GFX9-NEXT:    s_mul_hi_u32 s6, s3, s7
+; GFX9-NEXT:    s_mul_i32 s8, s6, s0
+; GFX9-NEXT:    s_sub_i32 s3, s3, s8
 ; GFX9-NEXT:    s_add_i32 s7, s6, 1
-; GFX9-NEXT:    s_sub_i32 s8, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_sub_i32 s8, s3, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s0
 ; GFX9-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX9-NEXT:    s_cselect_b32 s2, s8, s2
+; GFX9-NEXT:    s_cselect_b32 s3, s8, s3
 ; GFX9-NEXT:    s_add_i32 s7, s6, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s7, s6
-; GFX9-NEXT:    s_xor_b32 s3, s5, s4
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s7, s6
+; GFX9-NEXT:    s_xor_b32 s1, s2, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = sdiv i32 %x, %shl.y
@@ -6310,20 +6310,20 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
-; GFX9-NEXT:    s_add_i32 s2, s2, s4
-; GFX9-NEXT:    s_add_i32 s3, s3, s5
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
+; GFX9-NEXT:    s_add_i32 s0, s6, s0
+; GFX9-NEXT:    s_add_i32 s1, s7, s1
+; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
+; GFX9-NEXT:    s_ashr_i32 s1, s1, 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6365,21 +6365,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_mul_hi_i32 s5, s3, 0x80080081
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX9-NEXT:    s_add_i32 s5, s5, s3
-; GFX9-NEXT:    s_add_i32 s2, s2, s4
-; GFX9-NEXT:    s_lshr_b32 s3, s5, 31
-; GFX9-NEXT:    s_ashr_i32 s4, s5, 11
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
-; GFX9-NEXT:    s_add_i32 s4, s4, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
+; GFX9-NEXT:    s_mul_hi_i32 s1, s7, 0x80080081
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX9-NEXT:    s_add_i32 s1, s1, s7
+; GFX9-NEXT:    s_add_i32 s0, s6, s0
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s1, 11
+; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
+; GFX9-NEXT:    s_add_i32 s1, s1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6746,38 +6746,38 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: srem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s5, 0, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s4
+; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_ashr_i32 s1, s6, 31
+; GFX9-NEXT:    s_add_i32 s2, s6, s1
+; GFX9-NEXT:    s_sub_i32 s3, 0, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_xor_b32 s2, s2, s1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT:    s_mul_i32 s5, s5, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_xor_b32 s2, s2, s4
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_mul_i32 s3, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GFX9-NEXT:    s_add_i32 s6, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
+; GFX9-NEXT:    s_mul_i32 s3, s3, s0
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s3, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX9-NEXT:    s_sub_i32 s3, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s3, s2
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = srem i32 %x, %shl.y
@@ -6821,22 +6821,22 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: srem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
-; GFX9-NEXT:    s_add_i32 s4, s2, s4
-; GFX9-NEXT:    s_add_i32 s5, s3, s5
-; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
-; GFX9-NEXT:    s_and_b32 s4, s5, 0xfffff000
-; GFX9-NEXT:    s_sub_i32 s3, s3, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
+; GFX9-NEXT:    s_add_i32 s0, s6, s0
+; GFX9-NEXT:    s_add_i32 s1, s7, s1
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX9-NEXT:    s_sub_i32 s0, s6, s0
+; GFX9-NEXT:    s_sub_i32 s1, s7, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index 2c69ae58f0e61..ad6009e378178 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -9,19 +9,19 @@ declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32)
 define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) {
 ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5
 ; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
@@ -33,19 +33,19 @@ entry:
 define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
 ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5
 ; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
@@ -92,18 +92,18 @@ entry:
 define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) {
 ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v0, v0, v1, s[4:5] offset:-16 th:TH_ATOMIC_RETURN
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:-16 th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
@@ -114,20 +114,20 @@ entry:
 define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
 ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v0, v1, s[4:5] offset:-16
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v1, v0, s[4:5] offset:-16
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index af4116bd6aae5..1639ec6512208 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -131,13 +131,13 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %
 ;
 ; VI-LABEL: s_ubfe_sub_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT:    s_sub_i32 s0, 32, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_lshl_b32 s1, s2, s0
+; VI-NEXT:    s_sub_i32 s0, 32, s7
+; VI-NEXT:    s_lshl_b32 s1, s6, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
 ; VI-NEXT:    s_lshr_b32 s0, s1, s0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -175,13 +175,13 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: s_ubfe_sub_multi_use_shl_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT:    s_sub_i32 s0, 32, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_lshl_b32 s1, s2, s0
+; VI-NEXT:    s_sub_i32 s0, 32, s7
+; VI-NEXT:    s_lshl_b32 s1, s6, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
 ; VI-NEXT:    s_lshr_b32 s0, s1, s0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -329,13 +329,13 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %
 ;
 ; VI-LABEL: s_sbfe_sub_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT:    s_sub_i32 s0, 32, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_lshl_b32 s1, s2, s0
+; VI-NEXT:    s_sub_i32 s0, 32, s7
+; VI-NEXT:    s_lshl_b32 s1, s6, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
 ; VI-NEXT:    s_ashr_i32 s0, s1, s0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -373,13 +373,13 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: s_sbfe_sub_multi_use_shl_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT:    s_sub_i32 s0, 32, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_lshl_b32 s1, s2, s0
+; VI-NEXT:    s_sub_i32 s0, 32, s7
+; VI-NEXT:    s_lshl_b32 s1, s6, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
 ; VI-NEXT:    s_ashr_i32 s0, s1, s0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll
index f8bd44b7c98f5..8b2f66b438f14 100644
--- a/llvm/test/CodeGen/AMDGPU/bfm.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfm.ll
@@ -18,12 +18,12 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y)
 ;
 ; VI-LABEL: s_bfm_pattern:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bfm_b32 s2, s2, s3
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_bfm_b32 s0, s6, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %a = shl i32 1, %x
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index 64555f14a55cc..49ec09ddb7770 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -117,64 +117,64 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; FLAT-LABEL: v_brev_i16:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; FLAT-NEXT:    s_mov_b32 s7, 0xf000
-; FLAT-NEXT:    s_mov_b32 s6, -1
-; FLAT-NEXT:    s_mov_b32 s10, s6
-; FLAT-NEXT:    s_mov_b32 s11, s7
+; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; FLAT-NEXT:    s_mov_b32 s3, 0xf000
+; FLAT-NEXT:    s_mov_b32 s2, -1
+; FLAT-NEXT:    s_mov_b32 s10, s2
+; FLAT-NEXT:    s_mov_b32 s11, s3
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    s_mov_b32 s8, s2
-; FLAT-NEXT:    s_mov_b32 s9, s3
+; FLAT-NEXT:    s_mov_b32 s8, s6
+; FLAT-NEXT:    s_mov_b32 s9, s7
 ; FLAT-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; FLAT-NEXT:    s_mov_b32 s4, s0
-; FLAT-NEXT:    s_mov_b32 s5, s1
+; FLAT-NEXT:    s_mov_b32 s0, s4
+; FLAT-NEXT:    s_mov_b32 s1, s5
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
 ; FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; FLAT-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; FLAT-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: v_brev_i16:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GISEL-NEXT:    flat_load_ushort v0, v[0:1]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
 ; GISEL-NEXT:    flat_store_short v[0:1], v2
 ; GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FLAT-LABEL: v_brev_i16:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
+; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FLAT-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    s_mov_b32 s4, s2
-; GFX11-FLAT-NEXT:    s_mov_b32 s5, s3
-; GFX11-FLAT-NEXT:    buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-FLAT-NEXT:    s_mov_b32 s0, s6
+; GFX11-FLAT-NEXT:    s_mov_b32 s1, s7
+; GFX11-FLAT-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
 ; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
-; GFX11-FLAT-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-FLAT-NEXT:    global_store_d16_hi_b16 v1, v0, s[4:5]
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_i16:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    global_load_u16 v1, v0, s[6:7]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
-; GFX11-GISEL-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    global_store_d16_hi_b16 v0, v1, s[4:5]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -278,61 +278,61 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; FLAT-LABEL: v_brev_i32:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_mov_b32_e32 v1, s3
-; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; FLAT-NEXT:    v_mov_b32_e32 v1, s7
+; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dword v0, v[0:1]
-; FLAT-NEXT:    s_mov_b32 s3, 0xf000
-; FLAT-NEXT:    s_mov_b32 s2, -1
+; FLAT-NEXT:    s_mov_b32 s7, 0xf000
+; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
-; FLAT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: v_brev_i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    flat_load_dword v0, v[0:1]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
 ; GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FLAT-LABEL: v_brev_i32:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
+; GFX11-FLAT-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
 ; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
-; GFX11-FLAT-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FLAT-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_i32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-GISEL-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -362,59 +362,59 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32>
 ;
 ; FLAT-LABEL: s_brev_v2i32:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; FLAT-NEXT:    s_mov_b32 s7, 0xf000
-; FLAT-NEXT:    s_mov_b32 s6, -1
+; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; FLAT-NEXT:    s_mov_b32 s3, 0xf000
+; FLAT-NEXT:    s_mov_b32 s2, -1
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    s_mov_b32 s4, s0
-; FLAT-NEXT:    s_mov_b32 s5, s1
-; FLAT-NEXT:    s_brev_b32 s0, s3
-; FLAT-NEXT:    s_brev_b32 s1, s2
-; FLAT-NEXT:    v_mov_b32_e32 v0, s1
-; FLAT-NEXT:    v_mov_b32_e32 v1, s0
-; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; FLAT-NEXT:    s_mov_b32 s0, s4
+; FLAT-NEXT:    s_mov_b32 s1, s5
+; FLAT-NEXT:    s_brev_b32 s4, s7
+; FLAT-NEXT:    s_brev_b32 s5, s6
+; FLAT-NEXT:    v_mov_b32_e32 v0, s5
+; FLAT-NEXT:    v_mov_b32_e32 v1, s4
+; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_brev_v2i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_brev_b32 s2, s2
-; GISEL-NEXT:    s_brev_b32 s3, s3
-; GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    s_brev_b32 s0, s6
+; GISEL-NEXT:    s_brev_b32 s1, s7
+; GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FLAT-LABEL: s_brev_v2i32:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
+; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    s_brev_b32 s2, s2
-; GFX11-FLAT-NEXT:    s_brev_b32 s3, s3
+; GFX11-FLAT-NEXT:    s_brev_b32 s0, s6
+; GFX11-FLAT-NEXT:    s_brev_b32 s1, s7
 ; GFX11-FLAT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FLAT-NEXT:    s_mov_b32 s4, s0
-; GFX11-FLAT-NEXT:    s_mov_b32 s5, s1
-; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FLAT-NEXT:    s_mov_b32 s0, s4
+; GFX11-FLAT-NEXT:    s_mov_b32 s1, s5
+; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: s_brev_v2i32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_brev_b32 s2, s2
-; GFX11-GISEL-NEXT:    s_brev_b32 s3, s3
+; GFX11-GISEL-NEXT:    s_brev_b32 s0, s6
+; GFX11-GISEL-NEXT:    s_brev_b32 s1, s7
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -446,33 +446,33 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; FLAT-LABEL: v_brev_v2i32:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_mov_b32_e32 v1, s3
-; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; FLAT-NEXT:    v_mov_b32_e32 v1, s7
+; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; FLAT-NEXT:    s_mov_b32 s3, 0xf000
-; FLAT-NEXT:    s_mov_b32 s2, -1
+; FLAT-NEXT:    s_mov_b32 s7, 0xf000
+; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
 ; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
-; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: v_brev_v2i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
@@ -481,31 +481,31 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX11-FLAT-LABEL: v_brev_v2i32:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
-; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
+; GFX11-FLAT-NEXT:    global_load_b64 v[0:1], v0, s[6:7]
+; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
 ; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
-; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_v2i32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[6:7]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
-; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -534,52 +534,52 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #
 ;
 ; FLAT-LABEL: s_brev_i64:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; FLAT-NEXT:    s_mov_b32 s7, 0xf000
-; FLAT-NEXT:    s_mov_b32 s6, -1
+; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; FLAT-NEXT:    s_mov_b32 s3, 0xf000
+; FLAT-NEXT:    s_mov_b32 s2, -1
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    s_mov_b32 s4, s0
-; FLAT-NEXT:    s_mov_b32 s5, s1
-; FLAT-NEXT:    s_brev_b64 s[0:1], s[2:3]
-; FLAT-NEXT:    v_mov_b32_e32 v0, s0
-; FLAT-NEXT:    v_mov_b32_e32 v1, s1
-; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; FLAT-NEXT:    s_mov_b32 s0, s4
+; FLAT-NEXT:    s_mov_b32 s1, s5
+; FLAT-NEXT:    s_brev_b64 s[4:5], s[6:7]
+; FLAT-NEXT:    v_mov_b32_e32 v0, s4
+; FLAT-NEXT:    v_mov_b32_e32 v1, s5
+; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_brev_i64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    s_brev_b64 s[0:1], s[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FLAT-LABEL: s_brev_i64:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    s_brev_b64 s[4:5], s[2:3]
-; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
-; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-FLAT-NEXT:    s_brev_b64 s[0:1], s[6:7]
+; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
+; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: s_brev_i64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
+; GFX11-GISEL-NEXT:    s_brev_b64 s[0:1], s[6:7]
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -611,33 +611,33 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; FLAT-LABEL: v_brev_i64:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_mov_b32_e32 v1, s3
-; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; FLAT-NEXT:    v_mov_b32_e32 v1, s7
+; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; FLAT-NEXT:    s_mov_b32 s3, 0xf000
-; FLAT-NEXT:    s_mov_b32 s2, -1
+; FLAT-NEXT:    s_mov_b32 s7, 0xf000
+; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
 ; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
-; FLAT-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; FLAT-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: v_brev_i64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v4, s1
-; GISEL-NEXT:    v_mov_b32_e32 v3, s0
+; GISEL-NEXT:    v_mov_b32_e32 v3, s4
+; GISEL-NEXT:    v_mov_b32_e32 v4, s5
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
 ; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
@@ -646,31 +646,31 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX11-FLAT-LABEL: v_brev_i64:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
-; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
+; GFX11-FLAT-NEXT:    global_load_b64 v[0:1], v0, s[6:7]
+; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
 ; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
-; GFX11-FLAT-NEXT:    buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX11-FLAT-NEXT:    buffer_store_b64 v[1:2], off, s[4:7], 0
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_i64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[6:7]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    global_store_b64 v0, v[1:2], s[0:1]
+; GFX11-GISEL-NEXT:    global_store_b64 v0, v[1:2], s[4:5]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -793,74 +793,74 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; FLAT-LABEL: v_brev_v2i64:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_mov_b32_e32 v1, s3
-; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; FLAT-NEXT:    v_mov_b32_e32 v1, s7
+; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; FLAT-NEXT:    s_mov_b32 s3, 0xf000
-; FLAT-NEXT:    s_mov_b32 s2, -1
+; FLAT-NEXT:    s_mov_b32 s7, 0xf000
+; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT-NEXT:    v_bfrev_b32_e32 v4, v2
 ; FLAT-NEXT:    v_bfrev_b32_e32 v3, v3
 ; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
 ; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
-; FLAT-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0
+; FLAT-NEXT:    buffer_store_dwordx4 v[1:4], off, s[4:7], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: v_brev_v2i64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_bfrev_b32_e32 v4, v1
 ; GISEL-NEXT:    v_bfrev_b32_e32 v5, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GISEL-NEXT:    v_bfrev_b32_e32 v6, v3
 ; GISEL-NEXT:    v_bfrev_b32_e32 v7, v2
-; GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
 ; GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FLAT-LABEL: v_brev_v2i64:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
-; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
+; GFX11-FLAT-NEXT:    global_load_b128 v[0:3], v0, s[6:7]
+; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
 ; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v4, v2
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v3, v3
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
-; GFX11-FLAT-NEXT:    buffer_store_b128 v[1:4], off, s[0:3], 0
+; GFX11-FLAT-NEXT:    buffer_store_b128 v[1:4], off, s[4:7], 0
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_v2i64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
+; GFX11-GISEL-NEXT:    global_load_b128 v[0:3], v0, s[6:7]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v4, v1
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v5, v0
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v6, v3
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v7, v2
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    global_store_b128 v0, v[4:7], s[0:1]
+; GFX11-GISEL-NEXT:    global_store_b128 v0, v[4:7], s[4:5]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index b26d15ed3a1c8..8d347ae421437 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -277,53 +277,53 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
 ;
 ; GFX8-LABEL: build_v2i32_from_v4i16_shuffle:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s7, 16
+; GFX8-NEXT:    s_lshl_b32 s1, s6, 16
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    s_lshl_b32 s0, s6, 16
+; GFX10-NEXT:    s_lshl_b32 s1, s7, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: build_v2i32_from_v4i16_shuffle:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s3
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    s_lshl_b32 s0, s6, 16
+; GFX11-NEXT:    s_lshl_b32 s1, s7, 16
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
 ; GFX940:       ; %bb.0: ; %entry
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX940-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX940-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT:    s_lshl_b32 s0, s7, 16
+; GFX940-NEXT:    s_lshl_b32 s1, s6, 16
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1
 ; GFX940-NEXT:    s_endpgm
 entry:
   %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 15ebdd70ae881..00af922b05f93 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -18,20 +18,20 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) {
 ;
 ; VI-LABEL: kernel:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: kernel:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1245,26 +1245,26 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ;
 ; VI-LABEL: amd_kernel_v5i8:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s0, 24
-; VI-NEXT:    s_lshr_b32 s3, s0, 16
-; VI-NEXT:    s_add_i32 s3, s3, s3
-; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 s0, s2, 24
+; VI-NEXT:    s_lshr_b32 s1, s2, 16
+; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 8, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_add_i32 s2, s2, s2
+; VI-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s1
 ; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_and_b32 s1, s1, 0xff
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_and_b32 s3, s3, 0xff
 ; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_add_i32 s1, s1, s1
+; VI-NEXT:    s_add_i32 s3, s3, s3
 ; VI-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_mov_b32_e32 v0, 4
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    flat_store_byte v[0:1], v5
@@ -1273,16 +1273,16 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ;
 ; GFX11-LABEL: amd_kernel_v5i8:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b16 v0, 8, s0
-; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX11-NEXT:    s_lshr_b32 s3, s0, 24
-; GFX11-NEXT:    v_add_nc_u16 v1, s0, s0
-; GFX11-NEXT:    v_add_nc_u16 v2, s3, s3
+; GFX11-NEXT:    v_lshrrev_b16 v0, 8, s2
+; GFX11-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX11-NEXT:    s_lshr_b32 s1, s2, 24
+; GFX11-NEXT:    v_add_nc_u16 v1, s2, s2
+; GFX11-NEXT:    v_add_nc_u16 v2, s1, s1
 ; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, s2, s2
-; GFX11-NEXT:    v_add_nc_u16 v6, s1, s1
+; GFX11-NEXT:    v_add_nc_u16 v3, s0, s0
+; GFX11-NEXT:    v_add_nc_u16 v6, s3, s3
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v2, 8, v2
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 8, v0
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index b6948dab6bf9f..9336816866614 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -30,16 +30,16 @@
 define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noalias %sb) {
 ; GFX9-LABEL: cluster_load_cluster_store:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX9-NEXT:    flat_load_dword v3, v[0:1] offset:8
 ; GFX9-NEXT:    flat_load_dword v4, v[0:1] offset:16
 ; GFX9-NEXT:    flat_load_dword v5, v[0:1] offset:24
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    flat_store_dword v[0:1], v3 offset:8
@@ -49,20 +49,20 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
 ;
 ; GFX10-LABEL: cluster_load_cluster_store:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s4, s0, 8
-; GFX10-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-NEXT:    s_add_u32 s6, s0, 16
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    s_addc_u32 s7, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_add_u32 s0, s0, 24
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    v_mov_b32_e32 v4, s6
-; GFX10-NEXT:    v_mov_b32_e32 v5, s7
+; GFX10-NEXT:    s_add_u32 s0, s4, 8
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
+; GFX10-NEXT:    s_add_u32 s2, s4, 16
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    s_addc_u32 s3, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_add_u32 s0, s4, 24
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX10-NEXT:    s_clause 0x3
@@ -70,16 +70,16 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
 ; GFX10-NEXT:    flat_load_dword v9, v[2:3]
 ; GFX10-NEXT:    flat_load_dword v10, v[4:5]
 ; GFX10-NEXT:    flat_load_dword v11, v[6:7]
-; GFX10-NEXT:    s_add_u32 s0, s2, 8
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    s_add_u32 s0, s6, 8
+; GFX10-NEXT:    s_addc_u32 s1, s7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s2, 16
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    s_add_u32 s2, s2, 24
-; GFX10-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-NEXT:    s_add_u32 s0, s6, 16
+; GFX10-NEXT:    s_addc_u32 s1, s7, 0
+; GFX10-NEXT:    s_add_u32 s2, s6, 24
+; GFX10-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-NEXT:    s_addc_u32 s3, s7, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s3
@@ -96,15 +96,15 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
 ;
 ; GFX11-LABEL: cluster_load_cluster_store:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    flat_load_b32 v2, v[0:1]
 ; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:8
 ; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:16
 ; GFX11-NEXT:    flat_load_b32 v5, v[0:1] offset:24
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX11-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
@@ -155,16 +155,16 @@ bb:
 define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr noalias %sb) {
 ; GFX9-LABEL: cluster_load_valu_cluster_store:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX9-NEXT:    flat_load_dword v3, v[0:1] offset:8
 ; GFX9-NEXT:    flat_load_dword v4, v[0:1] offset:16
 ; GFX9-NEXT:    flat_load_dword v5, v[0:1] offset:24
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, 1, v3
@@ -175,20 +175,20 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
 ;
 ; GFX10-LABEL: cluster_load_valu_cluster_store:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s4, s0, 8
-; GFX10-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    s_add_u32 s6, s0, 16
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    s_addc_u32 s7, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_add_u32 s0, s0, 24
-; GFX10-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, s6
-; GFX10-NEXT:    v_mov_b32_e32 v5, s7
+; GFX10-NEXT:    s_add_u32 s0, s4, 8
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
+; GFX10-NEXT:    s_add_u32 s2, s4, 16
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_addc_u32 s3, s5, 0
+; GFX10-NEXT:    s_add_u32 s0, s4, 24
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_mov_b32_e32 v5, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX10-NEXT:    flat_load_dword v6, v[2:3]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
@@ -196,18 +196,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
 ; GFX10-NEXT:    flat_load_dword v8, v[0:1]
 ; GFX10-NEXT:    flat_load_dword v9, v[4:5]
 ; GFX10-NEXT:    flat_load_dword v10, v[2:3]
-; GFX10-NEXT:    s_add_u32 s0, s2, 8
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    s_add_u32 s4, s2, 16
+; GFX10-NEXT:    s_add_u32 s0, s6, 8
+; GFX10-NEXT:    s_addc_u32 s1, s7, 0
+; GFX10-NEXT:    s_add_u32 s2, s6, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    s_addc_u32 s5, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    s_addc_u32 s3, s7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s2, 24
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v4, s4
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    s_add_u32 s0, s6, 24
+; GFX10-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-NEXT:    v_mov_b32_e32 v5, s3
+; GFX10-NEXT:    s_addc_u32 s1, s7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s1
@@ -223,15 +223,15 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
 ;
 ; GFX11-LABEL: cluster_load_valu_cluster_store:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    flat_load_b32 v2, v[0:1] offset:8
 ; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
 ; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:16
 ; GFX11-NEXT:    flat_load_b32 v5, v[0:1] offset:24
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX11-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 1, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index 33c0d90f94a39..9c7fa1537c0c2 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -665,17 +665,17 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar
 ;
 ; GFX9-LABEL: sub_zext_setcc_commute:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT:    global_load_dword v3, v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v0, s6, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -714,17 +714,17 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar
 ;
 ; GFX9-LABEL: sub_sext_setcc_commute:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT:    global_load_dword v3, v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v0, s6, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 4decf39d04013..332b6013803cf 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -127,19 +127,19 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; VI-LABEL: v_ctlz_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i32:
@@ -164,41 +164,41 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_ctlz_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -235,21 +235,21 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; VI-LABEL: v_ctlz_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v1, 32, v1
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_v2i32:
@@ -277,48 +277,48 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_ctlz_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_v2i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -359,15 +359,15 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; VI-LABEL: v_ctlz_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v3, v3
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v2
@@ -377,7 +377,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    v_min_u32_e32 v1, 32, v1
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_v4i32:
@@ -411,11 +411,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_ctlz_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v3, v3
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
@@ -425,16 +425,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_v4i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
@@ -444,16 +444,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_v4i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
+; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v3, v3
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v2
@@ -465,7 +465,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -500,22 +500,22 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; VI-LABEL: v_ctlz_i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    s_mov_b32 s8, s6
+; VI-NEXT:    s_mov_b32 s9, s7
 ; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_subrev_u32_e32 v0, vcc, 24, v0
-; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i8:
@@ -550,42 +550,42 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; GFX10-LABEL: v_ctlz_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i8:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3]
+; GFX11-NEXT:    global_load_u8 v1, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
+; GFX11-NEXT:    global_store_b8 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -706,16 +706,16 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; VI-LABEL: s_ctlz_i64_trunc:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_flbit_i32_b64 s0, s[2:3]
-; VI-NEXT:    s_min_u32 s0, s0, 64
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_flbit_i32_b64 s4, s[6:7]
+; VI-NEXT:    s_min_u32 s4, s4, 64
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_ctlz_i64_trunc:
@@ -737,35 +737,35 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; GFX10-LABEL: s_ctlz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; GFX10-NEXT:    s_min_u32 s2, s2, 64
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; GFX10-NEXT:    s_min_u32 s0, s0, 64
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 64
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_ctlz_i64_trunc:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clz_i32_u64 s2, s[2:3]
+; GFX11-NEXT:    s_clz_i32_u64 s0, s[6:7]
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_u32 s2, s2, 64
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_min_u32 s0, s0, 64
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -799,16 +799,16 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; VI-LABEL: v_ctlz_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v3
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
+; VI-NEXT:    v_mov_b32_e32 v4, s5
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
@@ -847,25 +847,25 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_ctlz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
@@ -873,15 +873,15 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
@@ -889,7 +889,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX11-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -926,16 +926,16 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; VI-LABEL: v_ctlz_i64_trunc:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s3
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT:    v_mov_b32_e32 v2, s7
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
 ; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
-; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v0
+; VI-NEXT:    v_mov_b32_e32 v4, s5
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v1
@@ -974,49 +974,49 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; GFX10-LABEL: v_ctlz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i64_trunc:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT:    global_load_b64 v[1:2], v1, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX11-NEXT:    v_min3_u32 v1, v1, v2, 64
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1052,18 +1052,18 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
@@ -1090,40 +1090,40 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1159,18 +1159,18 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
@@ -1197,40 +1197,40 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1270,21 +1270,21 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
@@ -1313,47 +1313,47 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1392,21 +1392,21 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth:
@@ -1435,47 +1435,47 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1510,17 +1510,17 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
-; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i8_sel_eq_neg1:
@@ -1552,22 +1552,22 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1578,18 +1578,18 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
-; GFX11-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b8 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1624,25 +1624,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    s_mov_b32 s8, s6
+; VI-NEXT:    s_mov_b32 s9, s7
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v0
 ; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, -16, v2
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i16_sel_eq_neg1:
@@ -1674,25 +1674,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
@@ -1700,15 +1700,15 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
@@ -1717,7 +1717,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, -16, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1752,18 +1752,18 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
@@ -1795,23 +1795,23 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1824,19 +1824,19 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7f, v0
-; GFX11-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b8 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 756b819099682..97529b5687a71 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -99,17 +99,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -134,14 +134,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -174,15 +174,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; VI-LABEL: v_ctlz_zero_undef_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
@@ -211,15 +211,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -254,15 +254,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; VI-LABEL: v_ctlz_zero_undef_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v3, v3
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v2
@@ -295,17 +295,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
-; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -534,13 +534,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: s_ctlz_zero_undef_i64_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -562,14 +562,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_mov_b32 s1, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s4, s[2:3]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
   %ctlz_ret = icmp ne i64 %val, 0
@@ -602,18 +602,18 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i8_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 24, v0
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -650,17 +650,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
@@ -698,14 +698,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i16_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 1
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s0, s6, 1
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -716,8 +716,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, -16, v1
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -754,11 +754,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
@@ -766,7 +766,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
@@ -810,22 +810,22 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 3
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    s_add_u32 s4, s2, 2
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_add_u32 s2, s2, 1
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v7, s3
-; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    v_mov_b32_e32 v6, s2
+; VI-NEXT:    s_add_u32 s0, s6, 3
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_add_u32 s0, s6, 2
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_add_u32 s0, s6, 1
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v6, s0
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[6:7]
@@ -841,8 +841,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v2, 32, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -870,13 +870,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -887,7 +887,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@@ -947,43 +947,43 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i64_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 5
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_add_u32 s4, s2, 4
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    s_add_u32 s4, s2, 7
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    s_add_u32 s4, s2, 6
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v7, s5
-; VI-NEXT:    v_mov_b32_e32 v6, s4
-; VI-NEXT:    s_add_u32 s4, s2, 3
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v9, s5
-; VI-NEXT:    v_mov_b32_e32 v8, s4
-; VI-NEXT:    s_add_u32 s4, s2, 2
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v11, s5
-; VI-NEXT:    v_mov_b32_e32 v10, s4
-; VI-NEXT:    s_add_u32 s4, s2, 1
+; VI-NEXT:    s_add_u32 s0, s6, 5
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_add_u32 s0, s6, 4
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_add_u32 s0, s6, 7
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_add_u32 s0, s6, 6
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    s_add_u32 s0, s6, 3
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v9, s1
+; VI-NEXT:    v_mov_b32_e32 v8, s0
+; VI-NEXT:    s_add_u32 s0, s6, 2
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v11, s1
+; VI-NEXT:    v_mov_b32_e32 v10, s0
+; VI-NEXT:    s_add_u32 s0, s6, 1
 ; VI-NEXT:    flat_load_ubyte v12, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v13, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v4, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v5, v[6:7]
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_load_ubyte v6, v[8:9]
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_load_ubyte v7, v[10:11]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
@@ -1010,9 +1010,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_min_u32_e32 v0, v0, v3
-; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_min_u32_e32 v0, 64, v0
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1051,17 +1051,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
-; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
-; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:6
-; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:7
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:4
+; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:5
+; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:6
+; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:7
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
@@ -1082,7 +1082,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v4, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
@@ -1114,17 +1114,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1159,11 +1159,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1171,7 +1171,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1257,12 +1257,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; VI-LABEL: s_ctlz_zero_undef_i64_trunc:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1283,12 +1283,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
   %trunc = trunc i64 %ctlz to i32
@@ -1319,16 +1319,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v3
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
+; VI-NEXT:    v_mov_b32_e32 v4, s5
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
@@ -1365,17 +1365,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 32, v0
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -1409,16 +1409,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i64_trunc:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s3
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT:    v_mov_b32_e32 v2, s7
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
 ; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
-; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
+; VI-NEXT:    v_mov_b32_e32 v4, s5
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v0
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v1
@@ -1455,17 +1455,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -1499,17 +1499,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1535,16 +1535,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1578,17 +1578,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1614,16 +1614,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1656,16 +1656,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1698,11 +1698,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1710,9 +1710,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa s[2:3], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, s[2:3]
-; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa s[0:1], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1751,15 +1751,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v3, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
@@ -1800,17 +1800,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -1851,19 +1851,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1889,16 +1889,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1935,19 +1935,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1973,16 +1973,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -2019,19 +2019,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -2058,16 +2058,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -2104,19 +2104,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -2143,16 +2143,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index b6359f1816979..4f2bde8d0842b 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -91,18 +91,18 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp
 ;
 ; VI-LABEL: v_ctpop_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
-; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctpop_i16:
@@ -344,15 +344,15 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr
 ;
 ; VI-LABEL: v_ctpop_v2i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -360,7 +360,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctpop_v2i16:
@@ -430,15 +430,15 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr
 ;
 ; VI-LABEL: v_ctpop_v4i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
@@ -452,7 +452,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v3
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctpop_v4i16:
@@ -562,15 +562,17 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr
 ;
 ; VI-LABEL: v_ctpop_v8i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
@@ -769,18 +771,20 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add
 ;
 ; VI-LABEL: v_ctpop_v16i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
 ; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
@@ -1035,18 +1039,18 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal
 ;
 ; VI-LABEL: v_ctpop_i16_add_inline_constant:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 4
-; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctpop_i16_add_inline_constant:
@@ -1107,18 +1111,18 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1)
 ;
 ; VI-LABEL: v_ctpop_i16_add_inline_constant_inv:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 4
-; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctpop_i16_add_inline_constant_inv:
@@ -1180,19 +1184,19 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out
 ;
 ; VI-LABEL: v_ctpop_i16_add_literal:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; VI-NEXT:    s_movk_i32 s4, 0x3e7
+; VI-NEXT:    s_movk_i32 s0, 0x3e7
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
-; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctpop_i16_add_literal:
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 131ce14a7847c..633f12047e5b1 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -66,19 +66,19 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp
 ;
 ; VI-LABEL: v_ctpop_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -242,21 +242,21 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr
 ;
 ; VI-LABEL: v_ctpop_v2i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
 ; VI-NEXT:    v_bcnt_u32_b32 v1, v3, v2
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
@@ -298,18 +298,18 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr
 ;
 ; VI-LABEL: v_ctpop_v4i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v8, v2, 0
@@ -320,7 +320,7 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr
 ; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v4, v5, v4
 ; VI-NEXT:    v_bcnt_u32_b32 v5, v7, v6
-; VI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
@@ -507,22 +507,22 @@ define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; VI-LABEL: v_ctpop_i128:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v2, v3, v2
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i128, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index ee2894a66fbfc..483402d4778d1 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -111,19 +111,19 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; VI-LABEL: v_cttz_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i32:
@@ -148,28 +148,28 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_cttz_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -204,21 +204,21 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; VI-LABEL: v_cttz_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v1
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v1, 32, v1
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_v2i32:
@@ -246,32 +246,32 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_cttz_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_v2i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -310,15 +310,15 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; VI-LABEL: v_cttz_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v3, v3
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
@@ -328,7 +328,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    v_min_u32_e32 v1, 32, v1
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_v4i32:
@@ -362,11 +362,11 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_cttz_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v3, v3
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
@@ -376,16 +376,16 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_v4i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
@@ -395,7 +395,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -427,21 +427,21 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; VI-LABEL: v_cttz_i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    s_mov_b32 s8, s6
+; VI-NEXT:    s_mov_b32 s9, s7
 ; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v0, 0x100, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
-; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i8:
@@ -475,26 +475,26 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; GFX10-LABEL: v_cttz_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v1, 0x100, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i8:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %valptr
   %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
@@ -598,16 +598,16 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; VI-LABEL: s_cttz_i64_trunc:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_ff1_i32_b64 s0, s[2:3]
-; VI-NEXT:    s_min_u32 s0, s0, 64
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_ff1_i32_b64 s4, s[6:7]
+; VI-NEXT:    s_min_u32 s4, s4, 64
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_cttz_i64_trunc:
@@ -629,24 +629,24 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; GFX10-LABEL: s_cttz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ff1_i32_b64 s2, s[2:3]
-; GFX10-NEXT:    s_min_u32 s2, s2, 64
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_ff1_i32_b64 s0, s[6:7]
+; GFX10-NEXT:    s_min_u32 s0, s0, 64
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s2, s[2:3]
-; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 64
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[6:7]
+; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
   %trunc = trunc i64 %cttz to i32
@@ -678,16 +678,16 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; VI-LABEL: v_cttz_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v3
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
+; VI-NEXT:    v_mov_b32_e32 v4, s5
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v1
@@ -726,25 +726,25 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_cttz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
@@ -752,7 +752,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -787,16 +787,16 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; VI-LABEL: v_cttz_i64_trunc:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s3
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT:    v_mov_b32_e32 v2, s7
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
 ; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
-; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v0
+; VI-NEXT:    v_mov_b32_e32 v4, s5
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v2
@@ -835,33 +835,33 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; GFX10-LABEL: v_cttz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -895,18 +895,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_cttz_i32_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i32_sel_eq_neg1:
@@ -933,29 +933,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -989,18 +989,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_cttz_i32_sel_ne_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i32_sel_ne_neg1:
@@ -1027,29 +1027,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1087,21 +1087,21 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i32_sel_eq_bitwidth:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i32_sel_eq_bitwidth:
@@ -1130,32 +1130,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1192,21 +1192,21 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
@@ -1235,32 +1235,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1293,17 +1293,17 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i8_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
-; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i8_sel_eq_neg1:
@@ -1335,32 +1335,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s0, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s2
-; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[0:1]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s0
+; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1393,24 +1393,24 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i16_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    s_mov_b32 s8, s6
+; VI-NEXT:    s_mov_b32 s9, s7
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i16_sel_eq_neg1:
@@ -1442,31 +1442,31 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %valptr
   %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
@@ -1499,18 +1499,18 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_cttz_i7_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i7_sel_eq_neg1:
@@ -1542,23 +1542,23 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1570,7 +1570,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 392a44318b0a5..a6cbfa52db532 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -86,17 +86,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_cttz_zero_undef_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -121,14 +121,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -161,15 +161,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; VI-LABEL: v_cttz_zero_undef_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v1
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
@@ -198,15 +198,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -241,15 +241,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; VI-LABEL: v_cttz_zero_undef_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v3, v3
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
@@ -282,17 +282,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -510,13 +510,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: s_cttz_zero_undef_i64_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ff1_i32_b64 s2, s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_ff1_i32_b64 s0, s[6:7]
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -538,14 +538,14 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_mov_b32 s1, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s4, s[2:3]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s0, s[6:7]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
   %cttz_ret = icmp ne i64 %val, 0
@@ -577,17 +577,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; VI-LABEL: v_cttz_zero_undef_i8_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v0
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -622,16 +622,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
@@ -668,14 +668,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_cttz_zero_undef_i16_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 1
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s0, s6, 1
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -685,8 +685,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v0
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -721,18 +721,18 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
@@ -776,22 +776,22 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_cttz_zero_undef_i32_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 3
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    s_add_u32 s4, s2, 2
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_add_u32 s2, s2, 1
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v7, s3
-; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    v_mov_b32_e32 v6, s2
+; VI-NEXT:    s_add_u32 s0, s6, 3
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_add_u32 s0, s6, 2
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_add_u32 s0, s6, 1
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v6, s0
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[6:7]
@@ -807,8 +807,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v2, 32, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -836,13 +836,13 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -853,7 +853,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
@@ -913,43 +913,43 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_cttz_zero_undef_i64_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 5
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_add_u32 s4, s2, 4
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    s_add_u32 s4, s2, 7
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    s_add_u32 s4, s2, 6
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v7, s5
-; VI-NEXT:    v_mov_b32_e32 v6, s4
-; VI-NEXT:    s_add_u32 s4, s2, 3
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v9, s5
-; VI-NEXT:    v_mov_b32_e32 v8, s4
-; VI-NEXT:    s_add_u32 s4, s2, 2
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v11, s5
-; VI-NEXT:    v_mov_b32_e32 v10, s4
+; VI-NEXT:    s_add_u32 s0, s6, 5
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_add_u32 s0, s6, 4
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_add_u32 s0, s6, 7
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_add_u32 s0, s6, 6
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    s_add_u32 s0, s6, 3
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v9, s1
+; VI-NEXT:    v_mov_b32_e32 v8, s0
+; VI-NEXT:    s_add_u32 s0, s6, 2
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v11, s1
+; VI-NEXT:    v_mov_b32_e32 v10, s0
 ; VI-NEXT:    flat_load_ubyte v12, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v13, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v4, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v5, v[6:7]
-; VI-NEXT:    s_add_u32 s4, s2, 1
+; VI-NEXT:    s_add_u32 s0, s6, 1
 ; VI-NEXT:    flat_load_ubyte v6, v[8:9]
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_load_ubyte v7, v[10:11]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
@@ -976,9 +976,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_or_b32_e32 v0, v4, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, v3, v0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_min_u32_e32 v0, 64, v0
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1017,17 +1017,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
-; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
-; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:6
-; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:7
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:4
+; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:5
+; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:6
+; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:7
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
@@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v0, v4
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
@@ -1091,22 +1091,22 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_cttz_i32_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 3
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    s_add_u32 s4, s2, 2
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_add_u32 s2, s2, 1
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v7, s3
-; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    v_mov_b32_e32 v6, s2
+; VI-NEXT:    s_add_u32 s0, s6, 3
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_add_u32 s0, s6, 2
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_add_u32 s0, s6, 1
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v6, s0
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[6:7]
@@ -1121,8 +1121,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1152,13 +1152,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1213,22 +1213,22 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_cttz_i32_sel_ne_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 3
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    s_add_u32 s4, s2, 2
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_add_u32 s2, s2, 1
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v7, s3
-; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    v_mov_b32_e32 v6, s2
+; VI-NEXT:    s_add_u32 s0, s6, 3
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_add_u32 s0, s6, 2
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_add_u32 s0, s6, 1
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v6, s0
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[6:7]
@@ -1243,8 +1243,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1274,13 +1274,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1292,7 +1292,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1338,22 +1338,22 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 3
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    s_add_u32 s4, s2, 2
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_add_u32 s2, s2, 1
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v7, s3
-; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    v_mov_b32_e32 v6, s2
+; VI-NEXT:    s_add_u32 s0, s6, 3
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_add_u32 s0, s6, 2
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_add_u32 s0, s6, 1
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v6, s0
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[6:7]
@@ -1371,8 +1371,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1404,13 +1404,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1422,7 +1422,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1453,10 +1453,10 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i8_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xff
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1464,8 +1464,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1498,18 +1498,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, 0x100, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
@@ -1544,14 +1544,14 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i16_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 1
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s0, s6, 1
+; VI-NEXT:    s_addc_u32 s1, s7, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
@@ -1563,8 +1563,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1597,12 +1597,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
@@ -1610,7 +1610,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 3f513e120e141..fd4e182f6804e 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -918,50 +918,50 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
 ;
 ; VI-LABEL: load_i8_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_i8_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_i8_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_i8_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -996,58 +996,58 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v2i8_to_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v2i8_to_v2f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v2i8_to_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v2i8_to_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_u16 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1084,61 +1084,61 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v3i8_to_v3f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v3i8_to_v3f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX10-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v3i8_to_v3f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v3i8_to_v3f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
+; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1175,66 +1175,66 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v4i8_to_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1281,11 +1281,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ;
 ; VI-LABEL: load_v4i8_to_v4f32_unaligned:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1297,8 +1297,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v1, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; VI-NEXT:    s_waitcnt vmcnt(2)
@@ -1307,20 +1307,20 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
@@ -1329,19 +1329,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
@@ -1350,19 +1350,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] offset:3
-; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:2
-; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:1
-; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_u8 v1, v0, s[6:7] offset:3
+; GFX11-NEXT:    global_load_u8 v2, v0, s[6:7] offset:2
+; GFX11-NEXT:    global_load_u8 v4, v0, s[6:7] offset:1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
@@ -1371,7 +1371,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b128 v5, v[0:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1426,15 +1426,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ;
 ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_mov_b32 s8, 0x4000405
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s8, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v4, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s11
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s10, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1448,12 +1449,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s4, s2
-; VI-NEXT:    s_mov_b32 s5, s3
-; VI-NEXT:    s_mov_b32 s2, s6
-; VI-NEXT:    s_mov_b32 s3, s7
+; VI-NEXT:    s_mov_b32 s8, 0x4000405
+; VI-NEXT:    s_mov_b32 s0, s6
+; VI-NEXT:    s_mov_b32 s1, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v6
@@ -1467,21 +1467,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; VI-NEXT:    v_or_b32_e32 v5, v7, v3
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    v_perm_b32 v4, v4, v5, s8
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; VI-NEXT:    buffer_store_dword v4, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[4:5] offset:2
-; GFX10-NEXT:    global_load_ubyte v3, v0, s[4:5] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[8:9] offset:2
+; GFX10-NEXT:    global_load_ubyte v3, v0, s[8:9] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[10:11] offset:3
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[10:11] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v3, 8, v1
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
@@ -1491,21 +1491,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX10-NEXT:    v_perm_b32 v4, v5, v6, 0x4000405
-; GFX10-NEXT:    global_store_dwordx4 v7, v[0:3], s[0:1]
-; GFX10-NEXT:    global_store_dword v7, v4, s[2:3]
+; GFX10-NEXT:    global_store_dwordx4 v7, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dword v7, v4, s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    s_mov_b32 s0, 0x4000405
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[4:5] offset:2
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v3, v0, s[4:5] offset:3
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
-; GFX9-NEXT:    s_mov_b32 s4, 0x4000405
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[8:9] offset:2
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[10:11] offset:3
+; GFX9-NEXT:    global_load_ubyte v3, v0, s[8:9] offset:3
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[10:11] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshl_or_b32 v6, v3, 8, v1
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
@@ -1514,22 +1514,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-NEXT:    v_perm_b32 v4, v6, v7, s4
-; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
-; GFX9-NEXT:    global_store_dword v5, v4, s[2:3]
+; GFX9-NEXT:    v_perm_b32 v4, v6, v7, s0
+; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dword v5, v4, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_u8 v1, v0, s[4:5] offset:2
-; GFX11-NEXT:    global_load_u8 v3, v0, s[4:5] offset:3
-; GFX11-NEXT:    global_load_u8 v2, v0, s[6:7] offset:3
-; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7] offset:2
+; GFX11-NEXT:    global_load_u8 v1, v0, s[8:9] offset:2
+; GFX11-NEXT:    global_load_u8 v3, v0, s[8:9] offset:3
+; GFX11-NEXT:    global_load_u8 v2, v0, s[10:11] offset:3
+; GFX11-NEXT:    global_load_u8 v0, v0, s[10:11] offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 8, v1
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
@@ -1541,8 +1541,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_perm_b32 v4, v4, v5, 0x4000405
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX11-NEXT:    global_store_b32 v6, v4, s[2:3]
+; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[4:5]
+; GFX11-NEXT:    global_store_b32 v6, v4, s[6:7]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1605,23 +1605,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    v_mov_b32_e32 v5, 0xffffff00
+; VI-NEXT:    v_mov_b32_e32 v6, 9
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v6, 9
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s0, s6
+; VI-NEXT:    s_mov_b32 s1, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
 ; VI-NEXT:    v_mov_b32_e32 v7, 0x900
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s2
-; VI-NEXT:    s_mov_b32 s5, s3
-; VI-NEXT:    s_mov_b32 s2, s6
-; VI-NEXT:    s_mov_b32 s3, s7
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
@@ -1631,24 +1630,24 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-NEXT:    v_add_u16_e32 v9, 9, v4
 ; VI-NEXT:    v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_nop 0
 ; VI-NEXT:    v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u16_e32 v0, 0x900, v0
 ; VI-NEXT:    v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff00, v0
@@ -1666,22 +1665,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX10-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX10-NEXT:    global_store_dword v4, v5, s[2:3]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dword v4, v5, s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0xff00
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v0, s[0:1]
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_movk_i32 s5, 0x900
+; GFX9-NEXT:    s_movk_i32 s0, 0xff00
+; GFX9-NEXT:    s_movk_i32 s1, 0x900
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
@@ -1689,26 +1687,26 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff00, v4
 ; GFX9-NEXT:    v_add_u16_e32 v8, 9, v4
-; GFX9-NEXT:    v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v9, v4, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u16_e32 v0, 0x900, v0
-; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT:    global_store_dword v5, v0, s[2:3]
+; GFX9-NEXT:    global_store_dword v5, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_add_nc_u16 v2, v0, 9
@@ -1734,10 +1732,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    global_store_b32 v4, v5, s[2:3]
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
+; GFX11-NEXT:    global_store_b32 v4, v5, s[6:7]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1794,11 +1791,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v7i8_to_v7f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 5, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1819,8 +1816,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v9, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v5, v10
 ; VI-NEXT:    s_waitcnt vmcnt(4)
@@ -1833,23 +1830,23 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v8
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v9
-; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v7i8_to_v7f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x5
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3] offset:1
-; GFX10-NEXT:    global_load_short_d16 v7, v0, s[2:3] offset:4
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7] offset:1
+; GFX10-NEXT:    global_load_short_d16 v7, v0, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
@@ -1863,22 +1860,22 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[4:5] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v7i8_to_v7f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v7, v0, s[2:3] offset:2
-; GFX9-NEXT:    global_load_ubyte v8, v0, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v9, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v7, v0, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v8, v0, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v9, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
@@ -1892,23 +1889,23 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v9
-; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
-; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[0:1] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[4:5] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v7i8_to_v7f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:6
-; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] offset:3
-; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:2
-; GFX11-NEXT:    global_load_u8 v5, v0, s[2:3] offset:1
-; GFX11-NEXT:    global_load_d16_b16 v7, v0, s[2:3] offset:4
-; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_u8 v4, v0, s[6:7] offset:6
+; GFX11-NEXT:    global_load_u8 v1, v0, s[6:7] offset:3
+; GFX11-NEXT:    global_load_u8 v2, v0, s[6:7] offset:2
+; GFX11-NEXT:    global_load_u8 v5, v0, s[6:7] offset:1
+; GFX11-NEXT:    global_load_d16_b16 v7, v0, s[6:7] offset:4
+; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
@@ -1923,8 +1920,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b96 v8, v[4:6], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b96 v8, v[4:6], s[4:5] offset:16
+; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1966,15 +1963,15 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v8i8_to_v8f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[7:8], v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
@@ -1984,17 +1981,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
-; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v8i8_to_v8f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[8:9], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v7, v9
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v6, v9
@@ -2004,17 +2001,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v8
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v8
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v8
-; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[4:5] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v8i8_to_v8f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[7:8], v0, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[7:8], v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
@@ -2024,17 +2021,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
-; GFX9-NEXT:    global_store_dwordx4 v9, v[4:7], s[0:1] offset:16
-; GFX9-NEXT:    global_store_dwordx4 v9, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v9, v[4:7], s[4:5] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v9, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v8i8_to_v8f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[8:9], v0, s[2:3]
+; GFX11-NEXT:    global_load_b64 v[8:9], v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v7, v9
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v6, v9
@@ -2045,8 +2042,8 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v8
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v8
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v10, v[4:7], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v10, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b128 v10, v[4:7], s[4:5] offset:16
+; GFX11-NEXT:    global_store_b128 v10, v[0:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2081,58 +2078,58 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: i8_zext_inreg_i32_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2168,53 +2165,53 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: i8_zext_inreg_hi1_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2251,50 +2248,50 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: i8_zext_i32_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: i8_zext_i32_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_i32_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_i32_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2338,11 +2335,11 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2354,8 +2351,8 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v1, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; VI-NEXT:    s_waitcnt vmcnt(2)
@@ -2364,20 +2361,20 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
@@ -2386,19 +2383,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
@@ -2407,19 +2404,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] offset:3
-; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:2
-; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:1
-; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_u8 v1, v0, s[6:7] offset:3
+; GFX11-NEXT:    global_load_u8 v2, v0, s[6:7] offset:2
+; GFX11-NEXT:    global_load_u8 v4, v0, s[6:7] offset:1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
@@ -2428,7 +2425,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b128 v5, v[0:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2463,53 +2460,53 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte0_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: extract_byte0_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte0_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte0_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2544,53 +2541,53 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte1_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: extract_byte1_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte1_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte1_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2626,53 +2623,53 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte2_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: extract_byte2_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte2_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte2_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2708,53 +2705,53 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte3_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: extract_byte3_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte3_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte3_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2825,16 +2822,16 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: cvt_ubyte0_or_multiuse:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT:    global_load_dword v0, v0, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: cvt_ubyte0_or_multiuse:
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 8f31bb1fe0a81..67b0cef92d4d3 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -376,32 +376,32 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
 ;
 ; GFX9-LABEL: uniform_vec_i16_LH:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_pack_lh_b32_b16 s0, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX906-LABEL: uniform_vec_i16_LH:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
-; GFX906-NEXT:    v_mov_b32_e32 v1, s2
-; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT:    s_pack_lh_b32_b16 s0, s6, s7
+; GFX906-NEXT:    v_mov_b32_e32 v1, s0
+; GFX906-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX906-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: uniform_vec_i16_LH:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
+; GFX11-NEXT:    s_pack_lh_b32_b16 s0, s6, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -466,32 +466,32 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: uniform_vec_i16_HH:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_pack_hh_b32_b16 s0, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX906-LABEL: uniform_vec_i16_HH:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
-; GFX906-NEXT:    v_mov_b32_e32 v1, s2
-; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT:    s_pack_hh_b32_b16 s0, s6, s7
+; GFX906-NEXT:    v_mov_b32_e32 v1, s0
+; GFX906-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX906-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: uniform_vec_i16_HH:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
+; GFX11-NEXT:    s_pack_hh_b32_b16 s0, s6, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 777a8f3fef1c1..4cef7015811fe 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -319,16 +319,16 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2
 ;
 ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX9-NEXT:    ds_read_b32 v1, v1 offset:32
 ; GFX9-NEXT:    ds_read_b32 v2, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
@@ -370,16 +370,16 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX9-NEXT:    ds_read_b32 v1, v1 offset:32
 ; GFX9-NEXT:    ds_read_b32 v2, v2 offset:32
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 70011e56d016e..44d65c9e50086 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -208,30 +208,30 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
 ;
 ; VI-LABEL: extract_vector_elt_v3f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:2
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_vector_elt_v3f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT:    s_mov_b32 s4, s0
-; GFX11-NEXT:    s_mov_b32 s5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    s_mov_b32 s0, s4
+; GFX11-NEXT:    s_mov_b32 s1, s5
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 offset:2
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 offset:2
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -278,16 +278,16 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou
 ; GFX11-LABEL: dynamic_extract_vector_elt_v3f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x34
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x34
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s4, s4, 4
+; GFX11-NEXT:    s_lshl_b32 s0, s2, 4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[6:7], s0
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index f34824cd6cefe..c35e1e2a7e68a 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -212,14 +212,14 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ;
 ; GFX11-LABEL: s_fabs_v4f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
-; GFX11-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
+; GFX11-NEXT:    s_and_b32 s0, s6, 0x7fff7fff
+; GFX11-NEXT:    s_and_b32 s1, s7, 0x7fff7fff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 07581ade57ccd..8f0d6393641a2 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -113,14 +113,14 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ;
 ; VI-LABEL: fabs_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitset0_b32 s3, 31
-; VI-NEXT:    s_bitset0_b32 s2, 31
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_and_b32 s0, s7, 0x7fffffff
+; VI-NEXT:    s_and_b32 s1, s6, 0x7fffffff
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
@@ -185,12 +185,12 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa
 ;
 ; VI-LABEL: fabsf_fn_fold:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_mul_f32_e64 v2, |s2|, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    v_mul_f32_e64 v2, |s6|, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %fabs = call float @fabsf(float %in0)
@@ -215,12 +215,12 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i
 ;
 ; VI-LABEL: fabs_fold:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_mul_f32_e64 v2, |s2|, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    v_mul_f32_e64 v2, |s6|, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %fabs = call float @llvm.fabs.f32(float %in0)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index d53c0411ad88c..bdd08aa6287a9 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -123,11 +123,11 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ;
 ; GFX12-LABEL: s_test_canonicalize_var_f32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e64 v1, s2, s2
-; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    v_max_num_f32_e64 v1, s6, s6
+; GFX12-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index b8936911f0576..fd80580c2fb27 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1811,60 +1811,59 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
 ;
 ; VI-LABEL: s_copysign_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_movk_i32 s4, 0x7fff
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_movk_i32 s0, 0x7fff
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_lshr_b32 s3, s3, 16
-; VI-NEXT:    s_lshr_b32 s2, s2, 16
-; VI-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    s_lshr_b32 s1, s7, 16
+; VI-NEXT:    s_lshr_b32 s2, s6, 16
+; VI-NEXT:    v_bfi_b32 v0, s0, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s3
-; VI-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_bfi_b32 v1, s0, v1, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_copysign_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    s_lshr_b32 s1, s7, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s6, 16
+; GFX9-NEXT:    v_bfi_b32 v1, s0, v1, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_bfi_b32 v2, s4, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_bfi_b32 v2, s0, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_copysign_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s3
-; GFX11-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s3
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s2, v0
-; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fff, s2, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_lshr_b32 s0, s7, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    s_lshr_b32 s0, s6, 16
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s6, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fff, s0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index f48961c905f58..fb04b66e1a6ad 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -21,25 +21,25 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag
 ;
 ; VI-LABEL: s_test_copysign_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_brev_b32 s4, -2
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_brev_b32 s0, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_bfi_b32 v2, s4, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_bfi_b32 v2, s0, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s6, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1018,26 +1018,26 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: s_test_copysign_f32_fpext_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_brev_b32 s4, -2
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_brev_b32 s0, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e64 v0, 16, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_bfi_b32 v2, s4, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_lshlrev_b32_e64 v0, 16, s7
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_bfi_b32 v2, s0, v1, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f32_fpext_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s3
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s6, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1116,26 +1116,26 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: s_test_copysign_f32_fpext_bf16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_brev_b32 s4, -2
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_brev_b32 s0, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e64 v0, 16, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_bfi_b32 v2, s4, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_lshlrev_b32_e64 v0, 16, s7
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_bfi_b32 v2, s0, v1, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f32_fpext_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s3
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s6, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 0468175c5df50..b63976876c41c 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -94,12 +94,12 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX8-LABEL: s_fdiv_f32_ninf:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], s3, s3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s2, v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_div_scale_f32 v1, s[0:1], s7, s7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s6, v2, s6
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX8-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
@@ -110,18 +110,18 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX8-NEXT:    v_div_fixup_f32 v2, v1, s3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_div_fixup_f32 v2, v1, s7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_f32_ninf:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v3, -v0, v1, 1.0
@@ -133,16 +133,16 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_ninf:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_div_scale_f32 v0, null, s3, s3, s2
-; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX11-NEXT:    v_div_scale_f32 v0, null, s7, s7, s6
+; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -155,8 +155,8 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -253,12 +253,12 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX8-LABEL: s_fdiv_f32_ieee:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], s3, s3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s2, v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_div_scale_f32 v1, s[0:1], s7, s7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s6, v2, s6
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
 ; GFX8-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
 ; GFX8-NEXT:    v_fma_f32 v3, v4, v3, v3
@@ -267,49 +267,49 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
 ; GFX8-NEXT:    v_fma_f32 v4, v5, v3, v4
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX8-NEXT:    v_div_fixup_f32 v2, v1, s3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_div_fixup_f32 v2, v1, s7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_f32_ieee:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX10-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX10-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX10-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_ieee:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_div_scale_f32 v0, null, s3, s3, s2
+; GFX11-NEXT:    v_div_scale_f32 v0, null, s7, s7, s6
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX11-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX11-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX11-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX11-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX11-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -353,48 +353,48 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
 ;
 ; GFX8-LABEL: s_fdiv_25ulp_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0x6f800000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x2f800000
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_gt_f32_e64 vcc, |s3|, v0
+; GFX8-NEXT:    v_cmp_gt_f32_e64 vcc, |s7|, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, s7, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v2, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_25ulp_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |s3|
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4
-; GFX10-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |s7|
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0
+; GFX10-NEXT:    v_mul_f32_e32 v1, s7, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX10-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_25ulp_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |s3|
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4
-; GFX11-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |s7|
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0
+; GFX11-NEXT:    v_mul_f32_e32 v1, s7, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -465,52 +465,52 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a
 ;
 ; GFX8-LABEL: s_fdiv_25ulp_ieee_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_frexp_mant_f32_e32 v1, s3
+; GFX8-NEXT:    v_frexp_mant_f32_e32 v1, s7
 ; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v0, s3
-; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v2, s2
-; GFX8-NEXT:    v_frexp_mant_f32_e32 v3, s2
+; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v0, s7
+; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v2, s6
+; GFX8-NEXT:    v_frexp_mant_f32_e32 v3, s6
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v2, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v2, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, s3
-; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, s3
-; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, s2
-; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v3, s2
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, s7
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, s7
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, s6
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v3, s6
 ; GFX10-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_frexp_mant_f32_e32 v0, s3
-; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, s3
-; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, s2
-; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v3, s2
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v0, s7
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, s7
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, s6
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v3, s6
 ; GFX11-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -548,33 +548,33 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a,
 ;
 ; GFX8-LABEL: s_fdiv_fast_ieee_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX8-NEXT:    v_mul_f32_e32 v2, s6, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_fast_ieee_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_fast_ieee_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s7
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s6, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -612,33 +612,33 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a,
 ;
 ; GFX8-LABEL: s_fdiv_f32_fast_math:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX8-NEXT:    v_mul_f32_e32 v2, s6, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_f32_fast_math:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_fast_math:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s7
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s6, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -676,33 +676,33 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo
 ;
 ; GFX8-LABEL: s_fdiv_ulp25_f32_fast_math:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX8-NEXT:    v_mul_f32_e32 v2, s6, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s7
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s6, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -805,12 +805,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ;
 ; GFX8-LABEL: s_fdiv_f32_arcp_daz:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], s3, s3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s2, v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_div_scale_f32 v1, s[0:1], s7, s7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s6, v2, s6
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX8-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
@@ -821,18 +821,18 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX8-NEXT:    v_div_fixup_f32 v2, v1, s3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_div_fixup_f32 v2, v1, s7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_f32_arcp_daz:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v3, -v0, v1, 1.0
@@ -844,16 +844,16 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_arcp_daz:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_div_scale_f32 v0, null, s3, s3, s2
-; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX11-NEXT:    v_div_scale_f32 v0, null, s7, s7, s6
+; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -866,8 +866,8 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -905,33 +905,33 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a,
 ;
 ; GFX8-LABEL: s_fdiv_f32_arcp_ninf:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX8-NEXT:    v_mul_f32_e32 v2, s6, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_fdiv_f32_arcp_ninf:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_arcp_ninf:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s7
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s6, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index e44572985e6d2..ab3650f5650ac 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -4300,14 +4300,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_add_u32 s0, s0, 16
-; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_add_u32 s0, s4, 16
+; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
-; GCN2-NEXT:    v_mov_b32_e32 v2, s2
+; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
-; GCN2-NEXT:    v_mov_b32_e32 v3, s3
+; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
@@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v2, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
-; GCN3-NEXT:    v_mov_b32_e32 v3, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v2, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -4557,12 +4557,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v0, s0
-; GCN2-NEXT:    v_mov_b32_e32 v2, s2
-; GCN2-NEXT:    v_mov_b32_e32 v1, s1
-; GCN2-NEXT:    v_mov_b32_e32 v3, s3
+; GCN2-NEXT:    v_mov_b32_e32 v0, s4
+; GCN2-NEXT:    v_mov_b32_e32 v2, s6
+; GCN2-NEXT:    v_mov_b32_e32 v1, s5
+; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
@@ -4570,12 +4570,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v2, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
-; GCN3-NEXT:    v_mov_b32_e32 v3, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v2, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 5bd527149572e..816142dd17cce 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -3853,13 +3853,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN2-LABEL: atomic_max_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN2-NEXT:    s_mov_b32 s4, s3
-; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT:    s_add_u32 s0, s0, s4
-; GCN2-NEXT:    s_addc_u32 s1, s1, s5
+; GCN2-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN2-NEXT:    s_mov_b32 s0, s7
+; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT:    s_add_u32 s0, s4, s0
+; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -3869,7 +3869,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN2-NEXT:  .LBB88_1: ; %atomicrmw.start
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_max_i32_e32 v2, s2, v3
+; GCN2-NEXT:    v_max_i32_e32 v2, s6, v3
 ; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
@@ -3883,13 +3883,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN3-NEXT:    s_mov_b32 s4, s3
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN3-NEXT:    s_mov_b32 s0, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -3897,7 +3897,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN3-NEXT:  .LBB88_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_i32_e32 v2, s2, v3
+; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -4057,13 +4057,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ;
 ; GCN2-LABEL: atomic_max_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN2-NEXT:    s_mov_b32 s4, s3
-; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT:    s_add_u32 s0, s0, s4
-; GCN2-NEXT:    s_addc_u32 s1, s1, s5
+; GCN2-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN2-NEXT:    s_mov_b32 s0, s7
+; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT:    s_add_u32 s0, s4, s0
+; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    flat_load_dword v3, v[0:1]
@@ -4071,7 +4071,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ; GCN2-NEXT:  .LBB90_1: ; %atomicrmw.start
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_max_i32_e32 v2, s2, v3
+; GCN2-NEXT:    v_max_i32_e32 v2, s6, v3
 ; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
@@ -4085,13 +4085,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN3-NEXT:    s_mov_b32 s4, s3
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN3-NEXT:    s_mov_b32 s0, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
@@ -4099,7 +4099,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ; GCN3-NEXT:  .LBB90_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_i32_e32 v2, s2, v3
+; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -4996,13 +4996,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ;
 ; GCN2-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN2-NEXT:    s_mov_b32 s4, s3
-; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT:    s_add_u32 s0, s0, s4
-; GCN2-NEXT:    s_addc_u32 s1, s1, s5
+; GCN2-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN2-NEXT:    s_mov_b32 s0, s7
+; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT:    s_add_u32 s0, s4, s0
+; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -5012,7 +5012,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ; GCN2-NEXT:  .LBB102_1: ; %atomicrmw.start
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_max_u32_e32 v2, s2, v3
+; GCN2-NEXT:    v_max_u32_e32 v2, s6, v3
 ; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
@@ -5026,13 +5026,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ;
 ; GCN3-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN3-NEXT:    s_mov_b32 s4, s3
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN3-NEXT:    s_mov_b32 s0, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -5040,7 +5040,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ; GCN3-NEXT:  .LBB102_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_u32_e32 v2, s2, v3
+; GCN3-NEXT:    v_max_u32_e32 v2, s6, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -6790,13 +6790,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN2-LABEL: atomic_min_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN2-NEXT:    s_mov_b32 s4, s3
-; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT:    s_add_u32 s0, s0, s4
-; GCN2-NEXT:    s_addc_u32 s1, s1, s5
+; GCN2-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN2-NEXT:    s_mov_b32 s0, s7
+; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT:    s_add_u32 s0, s4, s0
+; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -6806,7 +6806,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN2-NEXT:  .LBB125_1: ; %atomicrmw.start
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_min_i32_e32 v2, s2, v3
+; GCN2-NEXT:    v_min_i32_e32 v2, s6, v3
 ; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
@@ -6820,13 +6820,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN3-LABEL: atomic_min_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN3-NEXT:    s_mov_b32 s4, s3
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN3-NEXT:    s_mov_b32 s0, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -6834,7 +6834,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN3-NEXT:  .LBB125_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_min_i32_e32 v2, s2, v3
+; GCN3-NEXT:    v_min_i32_e32 v2, s6, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index bac2d8b8b40c2..2a9a9ef7c43b6 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -41,17 +41,17 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p
 ;
 ; GFX11-LABEL: combine_to_fma_f64_0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[6:7] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -101,22 +101,22 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o
 ;
 ; GFX11-LABEL: combine_to_fma_f64_0_2use:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[6:7] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
+; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[4:5], v[0:1], v[2:3], v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
-; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
+; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[4:5] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
+; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -166,17 +166,17 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p
 ;
 ; GFX11-LABEL: combine_to_fma_f64_1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[6:7] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -220,17 +220,17 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_0_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[6:7] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
-; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -280,22 +280,22 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[6:7] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
+; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[4:5], v[0:1], v[2:3], -v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7]
-; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
+; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[4:5] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
+; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -345,17 +345,17 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_1_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[6:7] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
-; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -405,22 +405,22 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[6:7] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
+; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[6:7]
-; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
+; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[4:5] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
+; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -470,17 +470,17 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_2_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[6:7] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5]
-; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -532,22 +532,22 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1)
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[6:7] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
+; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[6:7]
-; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
+; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[4:5] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
+; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -605,22 +605,22 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1)
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[6:7] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
+; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7]
-; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
+; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[4:5] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
+; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -703,47 +703,47 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
 ;
 ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
 ; GFX11-NOFMA:       ; %bb.0:
-; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NOFMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NOFMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
 ; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[6:7] glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[6:7] offset:8 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[6:7] offset:16 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[6:7] offset:24 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[6:7] offset:32 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NOFMA-NEXT:    v_mul_f64 v[6:7], v[6:7], v[8:9]
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
 ; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
-; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
 ; GFX11-NOFMA-NEXT:    s_nop 0
 ; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
 ; GFX11-FMA:       ; %bb.0:
-; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
 ; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v10, s[6:7] glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v10, s[6:7] offset:8 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v10, s[6:7] offset:16 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[6:7], v10, s[6:7] offset:24 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[8:9], v10, s[6:7] offset:32 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f64 v[4:5], v[6:7], v[8:9], -v[4:5]
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
 ; GFX11-FMA-NEXT:    s_nop 0
 ; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
@@ -825,47 +825,47 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
 ;
 ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
 ; GFX11-NOFMA:       ; %bb.0:
-; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NOFMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NOFMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
 ; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[6:7] glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[6:7] offset:8 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[6:7] offset:16 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[6:7] offset:24 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[6:7] offset:32 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NOFMA-NEXT:    v_mul_f64 v[6:7], v[6:7], v[8:9]
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
-; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
 ; GFX11-NOFMA-NEXT:    s_nop 0
 ; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
 ; GFX11-FMA:       ; %bb.0:
-; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
 ; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v10, s[6:7] glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v10, s[6:7] offset:8 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v10, s[6:7] offset:16 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[6:7], v10, s[6:7] offset:24 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[8:9], v10, s[6:7] offset:32 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[8:9], v[0:1]
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1]
-; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
 ; GFX11-FMA-NEXT:    s_nop 0
 ; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 7830c91851bfa..0cb6b783f496e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -46,11 +46,11 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
 ;
 ; GFX11-LABEL: multiple_fadd_use_test_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_add_f32_e64 v0, s3, -1.0
-; GFX11-NEXT:    v_add_f32_e64 v1, s2, -1.0
+; GFX11-NEXT:    v_add_f32_e64 v0, s7, -1.0
+; GFX11-NEXT:    v_add_f32_e64 v1, s6, -1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 vcc_lo, |v0|, |v1|
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
@@ -58,7 +58,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v0, v0
 ; GFX11-NEXT:    v_fma_f32 v0, -v1, v0, 1.0
-; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -174,14 +174,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
 ;
 ; GFX11-LABEL: multiple_use_fadd_fmad_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_add_f32_e64 v1, |s2|, |s2|
-; GFX11-NEXT:    v_fma_f32 v2, |s2|, 2.0, s3
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT:    v_add_f32_e64 v1, |s6|, |s6|
+; GFX11-NEXT:    v_fma_f32 v2, |s6|, 2.0, s7
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1] offset:4 dlc
+; GFX11-NEXT:    global_store_b32 v0, v2, s[4:5] offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 718be90eb75fc..bde0dc326f9ac 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -1339,11 +1339,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
 define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
 ; VI-FLUSH-LABEL: mad_sub_f16:
 ; VI-FLUSH:       ; %bb.0:
-; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1355,8 +1355,8 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s5
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, v2, -v3
 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
@@ -1364,11 +1364,11 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ;
 ; VI-DENORM-CONTRACT-LABEL: mad_sub_f16:
 ; VI-DENORM-CONTRACT:       ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s7
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1380,8 +1380,8 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s5
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, -v3
 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
@@ -1389,102 +1389,102 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_f16:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-FLUSH-NEXT:    s_nop 0
 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16:
 ; GFX11-DENORM-STRICT:       ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16:
 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
-; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
@@ -1508,11 +1508,11 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
 ; VI-FLUSH-LABEL: mad_sub_inv_f16:
 ; VI-FLUSH:       ; %bb.0:
-; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1524,8 +1524,8 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s5
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_mad_f16 v2, -v7, v2, v3
 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
@@ -1533,11 +1533,11 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
 ; VI-DENORM-CONTRACT:       ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s7
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1549,8 +1549,8 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s5
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, -v7, v2, v3
 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
@@ -1558,102 +1558,102 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_inv_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_inv_f16:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-FLUSH-NEXT:    s_nop 0
 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16:
 ; GFX11-DENORM-STRICT:       ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
-; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
@@ -1677,11 +1677,11 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
 ; VI-FLUSH-LABEL: mad_sub_fabs_f16:
 ; VI-FLUSH:       ; %bb.0:
-; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1693,8 +1693,8 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s5
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, v2, -|v3|
 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
@@ -1702,11 +1702,11 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
 ; VI-DENORM-CONTRACT:       ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s7
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1718,8 +1718,8 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s5
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, -|v3|
 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
@@ -1727,102 +1727,102 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-FLUSH-NEXT:    s_nop 0
 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
 ; GFX11-DENORM-STRICT:       ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
-; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
@@ -1847,11 +1847,11 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
 ; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16:
 ; VI-FLUSH:       ; %bb.0:
-; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1863,8 +1863,8 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s5
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_mad_f16 v2, -v7, v2, |v3|
 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
@@ -1872,11 +1872,11 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ;
 ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
 ; VI-DENORM-CONTRACT:       ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s7
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1888,8 +1888,8 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s5
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, -v7, v2, |v3|
 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
@@ -1897,102 +1897,102 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-FLUSH-NEXT:    s_nop 0
 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX11-DENORM-STRICT:       ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
-; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
@@ -2017,11 +2017,11 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
 ; VI-FLUSH-LABEL: neg_neg_mad_f16:
 ; VI-FLUSH:       ; %bb.0:
-; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2033,8 +2033,8 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s5
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v7, v2
 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
@@ -2042,11 +2042,11 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
 ; VI-DENORM-CONTRACT:       ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s7
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2058,8 +2058,8 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s5
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, v3
 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
@@ -2067,102 +2067,102 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; GFX10-FLUSH-LABEL: neg_neg_mad_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: neg_neg_mad_f16:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-FLUSH-NEXT:    s_nop 0
 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16:
 ; GFX11-DENORM-STRICT:       ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v3, s[4:5]
 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
@@ -2188,11 +2188,11 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
 ; VI-FLUSH-LABEL: mad_fabs_sub_f16:
 ; VI-FLUSH:       ; %bb.0:
-; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2204,8 +2204,8 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s5
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, |v2|, -v3
 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
@@ -2213,11 +2213,11 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
 ; VI-DENORM-CONTRACT:       ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s7
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2229,8 +2229,8 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s5
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, |v2|, -v3
 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
@@ -2238,102 +2238,102 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-FLUSH-NEXT:    s_nop 0
 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
 ; GFX11-DENORM-STRICT:       ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
-; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index f411a76e75ab6..ce5bb6617d9f4 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -124,23 +124,23 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
 ;
 ; VI-LABEL: fnearbyint_v2f32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_rndne_f32_e32 v1, s3
-; VI-NEXT:    v_rndne_f32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_rndne_f32_e32 v1, s7
+; VI-NEXT:    v_rndne_f32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fnearbyint_v2f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rndne_f32_e32 v1, s3
-; GFX11-NEXT:    v_rndne_f32_e32 v0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    v_rndne_f32_e32 v1, s7
+; GFX11-NEXT:    v_rndne_f32_e32 v0, s6
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index b5440b9c38c9f..277dc01ccd99c 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -2813,15 +2813,15 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %
 ;
 ; VI-LABEL: s_fneg_select_infloop_regression_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitcmp1_b32 s1, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_bitcmp1_b32 s5, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e64 v2, -v0, 0, s[0:1]
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %i = select i1 %arg1, float 0.0, float %arg
@@ -3161,15 +3161,15 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar
 ;
 ; VI-LABEL: s_fneg_select_infloop_regression_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s1, 1, s1
-; VI-NEXT:    s_cselect_b32 s0, 0, s0
-; VI-NEXT:    s_xor_b32 s0, s0, 0x80008000
-; VI-NEXT:    s_cmp_eq_u32 s1, 1
-; VI-NEXT:    s_cselect_b32 s0, 0, s0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_and_b32 s0, 1, s5
+; VI-NEXT:    s_cselect_b32 s1, 0, s4
+; VI-NEXT:    s_xor_b32 s1, s1, 0x80008000
+; VI-NEXT:    s_cmp_eq_u32 s0, 1
+; VI-NEXT:    s_cselect_b32 s0, 0, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -3293,15 +3293,15 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %
 ;
 ; VI-LABEL: s_fabs_select_infloop_regression_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitcmp1_b32 s1, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_bitcmp1_b32 s5, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e64 v2, |v0|, 0, s[0:1]
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %i = select i1 %arg1, float 0.0, float %arg
@@ -3343,15 +3343,15 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1
 ;
 ; VI-LABEL: s_fneg_fabs_select_infloop_regression:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitcmp1_b32 s1, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_bitcmp1_b32 s5, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1]
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %i = select i1 %arg1, float 0.0, float %arg
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 4364b32e62f8c..79f1057af22e7 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -447,14 +447,14 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
 ;
 ; GFX11-LABEL: fneg_fabs_v4f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_or_b32 s2, s2, 0x80008000
-; GFX11-NEXT:    s_or_b32 s3, s3, 0x80008000
+; GFX11-NEXT:    s_or_b32 s0, s6, 0x80008000
+; GFX11-NEXT:    s_or_b32 s1, s7, 0x80008000
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 3c000d4fa63a3..32033c52fc998 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -18,12 +18,12 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x,
 ;
 ; VI-LABEL: fneg_fabsf_fadd_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_sub_f32_e64 v2, s3, |v0|
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_sub_f32_e64 v2, s7, |v0|
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -49,12 +49,12 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x,
 ;
 ; VI-LABEL: fneg_fabsf_fmul_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mul_f32_e64 v2, s3, -|v0|
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mul_f32_e64 v2, s7, -|v0|
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -213,14 +213,14 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
 ;
 ; VI-LABEL: fneg_fabsf_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitset1_b32 s3, 31
-; VI-NEXT:    s_bitset1_b32 s2, 31
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_or_b32 s0, s7, 0x80000000
+; VI-NEXT:    s_or_b32 s1, s6, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index d78bdfe08772a..94fc9293e774c 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -65,27 +65,27 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
 ;
 ; VI-LABEL: s_fneg_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
-; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_xor_b32 s0, s7, 0x80000000
+; VI-NEXT:    s_xor_b32 s1, s6, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fneg_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
-; GFX11-NEXT:    s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT:    s_xor_b32 s0, s6, 0x80000000
+; GFX11-NEXT:    s_xor_b32 s1, s7, 0x80000000
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index 9f339af0f5580..0095bcd89e33b 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -16,17 +16,17 @@ declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %d
 define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
 ; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX12-GISEL-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
@@ -36,9 +36,9 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
 define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
 ; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    ds_pk_add_bf16 v0, v1
 ; GFX12-SDAG-NEXT:    s_wait_dscnt 0x0
 ; GFX12-SDAG-NEXT:    global_inv scope:SCOPE_SYS
@@ -46,9 +46,9 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
 ;
 ; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
 ; GFX12-GISEL-NEXT:    ds_pk_add_bf16 v1, v0
 ; GFX12-GISEL-NEXT:    s_wait_dscnt 0x0
 ; GFX12-GISEL-NEXT:    global_inv scope:SCOPE_SYS
@@ -116,19 +116,19 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
 define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
 ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-SDAG-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-GISEL-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
 ; GFX12-GISEL-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
@@ -164,19 +164,19 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
 define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
 ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-SDAG-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-GISEL-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
 ; GFX12-GISEL-NEXT:    s_endpgm
   %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
@@ -212,20 +212,20 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
 define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
 ; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT:    global_atomic_pk_add_bf16 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    global_atomic_pk_add_bf16 v0, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT:    global_atomic_pk_add_bf16 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-GISEL-NEXT:    global_atomic_pk_add_bf16 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 5761c198e20ba..7f87b41127767 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -24,10 +24,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
 ;
 ; GFX12-LABEL: flat_atomic_fadd_f32_noret:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
   %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
@@ -190,10 +190,10 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
 ;
 ; GFX12-LABEL: flat_atomic_fadd_v2f16_noret:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
@@ -235,10 +235,10 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da
 ;
 ; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
   %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
@@ -280,10 +280,10 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr
 ;
 ; GFX12-LABEL: global_atomic_fadd_v2bf16_noret:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT:    global_atomic_pk_add_bf16 v0, v1, s[0:1]
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-NEXT:    global_atomic_pk_add_bf16 v0, v1, s[4:5]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -316,18 +316,18 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
 define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX940-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX12-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
@@ -359,10 +359,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
 define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    ds_pk_add_bf16 v0, v1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
@@ -371,9 +371,9 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
 ;
 ; GFX12-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    ds_pk_add_bf16 v0, v1
 ; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index 18d2e52e8f900..ca2fa0f20f0f5 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -362,29 +362,29 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
 ;
 ; VI-LABEL: test_isfinite_not_pattern_2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cmp_o_f32_e64 s[4:5], s2, s2
-; VI-NEXT:    v_cmp_neq_f32_e64 s[2:3], |s3|, v0
-; VI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_cmp_o_f32_e64 s[0:1], s6, s6
+; VI-NEXT:    v_cmp_neq_f32_e64 s[2:3], |s7|, v0
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_isfinite_not_pattern_2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_o_f32_e64 s2, s2, s2
-; GFX11-NEXT:    v_cmp_neq_f32_e64 s3, 0x7f800000, |s3|
+; GFX11-NEXT:    v_cmp_o_f32_e64 s0, s6, s6
+; GFX11-NEXT:    v_cmp_neq_f32_e64 s1, 0x7f800000, |s7|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s2, s2, s3
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_and_b32 s0, s0, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
index 587340c7aa342..2928647a9627d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
@@ -664,51 +664,51 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_store_dword v1, v0, s[10:11]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX1030-NEXT:    global_store_dword v1, v0, s[10:11]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX1100:       ; %bb.0: ; %main_body
-; GFX1100-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX1100-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc
+; GFX1100-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc
 ; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GFX1100-NEXT:    global_store_b32 v1, v0, s[10:11]
 ; GFX1100-NEXT:    s_nop 0
 ; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX12-NEXT:    s_mov_b32 s0, 4
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT:    s_mov_b32 s4, 4
-; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN
+; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[4:7], s0 offen th:TH_ATOMIC_NT_RETURN
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GFX12-NEXT:    global_store_b32 v1, v0, s[10:11]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -743,37 +743,37 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; G_GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
+; G_GFX10-NEXT:    global_store_dword v1, v0, s[10:11]
 ; G_GFX10-NEXT:    s_endpgm
 ;
 ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s8
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s9
+; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; G_GFX1030-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    global_store_dword v1, v0, s[6:7]
+; G_GFX1030-NEXT:    global_store_dword v1, v0, s[10:11]
 ; G_GFX1030-NEXT:    s_endpgm
 ;
 ; G_GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX1100:       ; %bb.0: ; %main_body
-; G_GFX1100-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; G_GFX1100-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; G_GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc
+; G_GFX1100-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; G_GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc
 ; G_GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX1100-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1100-NEXT:    global_store_b32 v1, v0, s[6:7]
+; G_GFX1100-NEXT:    global_store_b32 v1, v0, s[10:11]
 ; G_GFX1100-NEXT:    s_nop 0
 ; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
index e3ed0fa491884..f4745a5acbcd6 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
@@ -50,22 +50,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8)
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
 ; GFX1030:       ; %bb.0: ; %main_body
 ; GFX1030-NEXT:    s_clause 0x1
-; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
 ; GFX1100:       ; %bb.0: ; %main_body
 ; GFX1100-NEXT:    s_clause 0x1
-; GFX1100-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
-; GFX1100-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1100-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1100-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
+; GFX1100-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1100-NEXT:    buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen
 ; GFX1100-NEXT:    s_nop 0
 ; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
@@ -408,22 +408,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8)
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
 ; GFX1030:       ; %bb.0: ; %main_body
 ; GFX1030-NEXT:    s_clause 0x1
-; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
 ; GFX1100:       ; %bb.0: ; %main_body
 ; GFX1100-NEXT:    s_clause 0x1
-; GFX1100-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
-; GFX1100-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1100-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
+; GFX1100-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen
 ; GFX1100-NEXT:    s_nop 0
 ; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
@@ -607,37 +607,37 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
 ;
 ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_store_dword v1, v0, s[10:11]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX1030-NEXT:    global_store_dword v1, v0, s[10:11]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX1100:       ; %bb.0: ; %main_body
-; GFX1100-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX1100-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc
+; GFX1100-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc
 ; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GFX1100-NEXT:    global_store_b32 v1, v0, s[10:11]
 ; GFX1100-NEXT:    s_nop 0
 ; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
@@ -672,37 +672,37 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
 ;
 ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; G_GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
+; G_GFX10-NEXT:    global_store_dword v1, v0, s[10:11]
 ; G_GFX10-NEXT:    s_endpgm
 ;
 ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s8
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s9
+; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
 ; G_GFX1030-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    global_store_dword v1, v0, s[6:7]
+; G_GFX1030-NEXT:    global_store_dword v1, v0, s[10:11]
 ; G_GFX1030-NEXT:    s_endpgm
 ;
 ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX1100:       ; %bb.0: ; %main_body
-; G_GFX1100-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; G_GFX1100-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; G_GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc
+; G_GFX1100-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; G_GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc
 ; G_GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX1100-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1100-NEXT:    global_store_b32 v1, v0, s[6:7]
+; G_GFX1100-NEXT:    global_store_b32 v1, v0, s[10:11]
 ; G_GFX1100-NEXT:    s_nop 0
 ; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
index d827ea0503a3b..dc1f8ca0b7d0f 100644
--- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
@@ -28,38 +28,38 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o
 ;
 ; GFX8-LABEL: test_convert_fp16_to_fp32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
-; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    s_mov_b32 s10, s6
-; GFX8-NEXT:    s_mov_b32 s11, s7
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_mov_b32 s10, s2
+; GFX8-NEXT:    s_mov_b32 s11, s3
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s8, s2
-; GFX8-NEXT:    s_mov_b32 s9, s3
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    s_mov_b32 s9, s7
 ; GFX8-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX8-NEXT:    s_mov_b32 s4, s0
-; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    s_mov_b32 s0, s4
+; GFX8-NEXT:    s_mov_b32 s1, s5
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_convert_fp16_to_fp32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s6, -1
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s10, s6
-; GFX11-NEXT:    s_mov_b32 s11, s7
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s10, s2
+; GFX11-NEXT:    s_mov_b32 s11, s3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s8, s2
-; GFX11-NEXT:    s_mov_b32 s9, s3
-; GFX11-NEXT:    s_mov_b32 s4, s0
+; GFX11-NEXT:    s_mov_b32 s8, s6
+; GFX11-NEXT:    s_mov_b32 s9, s7
+; GFX11-NEXT:    s_mov_b32 s0, s4
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_mov_b32 s5, s1
+; GFX11-NEXT:    s_mov_b32 s1, s5
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
index 03b8251ea4640..03ee67dc16678 100644
--- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
@@ -27,41 +27,41 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o
 ;
 ; GFX8-LABEL: test_convert_fp16_to_fp64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
-; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    s_mov_b32 s10, s6
-; GFX8-NEXT:    s_mov_b32 s11, s7
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_mov_b32 s10, s2
+; GFX8-NEXT:    s_mov_b32 s11, s3
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s8, s2
-; GFX8-NEXT:    s_mov_b32 s9, s3
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    s_mov_b32 s9, s7
 ; GFX8-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX8-NEXT:    s_mov_b32 s4, s0
-; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    s_mov_b32 s0, s4
+; GFX8-NEXT:    s_mov_b32 s1, s5
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_convert_fp16_to_fp64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s6, -1
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s10, s6
-; GFX11-NEXT:    s_mov_b32 s11, s7
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s10, s2
+; GFX11-NEXT:    s_mov_b32 s11, s3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s8, s2
-; GFX11-NEXT:    s_mov_b32 s9, s3
-; GFX11-NEXT:    s_mov_b32 s4, s0
+; GFX11-NEXT:    s_mov_b32 s8, s6
+; GFX11-NEXT:    s_mov_b32 s9, s7
+; GFX11-NEXT:    s_mov_b32 s0, s4
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_mov_b32 s5, s1
+; GFX11-NEXT:    s_mov_b32 s1, s5
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 8ab82b722445e..4f6ea77205133 100644
--- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -27,38 +27,38 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o
 ;
 ; GFX8-LABEL: test_convert_fp32_to_fp16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
-; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    s_mov_b32 s10, s6
-; GFX8-NEXT:    s_mov_b32 s11, s7
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_mov_b32 s10, s2
+; GFX8-NEXT:    s_mov_b32 s11, s3
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s8, s2
-; GFX8-NEXT:    s_mov_b32 s9, s3
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    s_mov_b32 s9, s7
 ; GFX8-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT:    s_mov_b32 s4, s0
-; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    s_mov_b32 s0, s4
+; GFX8-NEXT:    s_mov_b32 s1, s5
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_convert_fp32_to_fp16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s6, -1
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s10, s6
-; GFX11-NEXT:    s_mov_b32 s11, s7
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s10, s2
+; GFX11-NEXT:    s_mov_b32 s11, s3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s8, s2
-; GFX11-NEXT:    s_mov_b32 s9, s3
-; GFX11-NEXT:    s_mov_b32 s4, s0
+; GFX11-NEXT:    s_mov_b32 s8, s6
+; GFX11-NEXT:    s_mov_b32 s9, s7
+; GFX11-NEXT:    s_mov_b32 s0, s4
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_mov_b32 s5, s1
+; GFX11-NEXT:    s_mov_b32 s1, s5
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
index d610091840b95..a058c1119d4fd 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
@@ -452,26 +452,26 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
 ;
 ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX10-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    v_mov_b32_e32 v2, s10
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s7
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s10
+; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX1030-NEXT:    s_endpgm
@@ -506,26 +506,26 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
 ;
 ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; G_GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s7
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s10
+; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s11
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; G_GFX10-NEXT:    s_endpgm
 ;
 ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s7
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s8
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s9
+; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s10
+; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s11
 ; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX1030-NEXT:    ds_write_b64 v2, v[0:1]
 ; G_GFX1030-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
index 5f501fec24c2e..046c92a2fc63f 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
@@ -54,14 +54,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
 ; GFX1030:       ; %bb.0: ; %main_body
 ; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dword s6, s[0:1], 0x3c
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s8
+; GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
@@ -291,14 +291,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
 ; GFX1030:       ; %bb.0: ; %main_body
 ; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dword s6, s[0:1], 0x3c
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s8
+; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
@@ -452,26 +452,26 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
 ;
 ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX10-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    v_mov_b32_e32 v2, s10
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s7
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s10
+; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX1030-NEXT:    s_endpgm
@@ -506,26 +506,26 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
 ;
 ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; G_GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s7
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s10
+; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s11
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; G_GFX10-NEXT:    s_endpgm
 ;
 ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s7
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s8
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s9
+; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s10
+; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s11
 ; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX1030-NEXT:    ds_write_b64 v2, v[0:1]
 ; G_GFX1030-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index 04ef30bd26aa5..b4fee7017d698 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -100,15 +100,15 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %
 ;
 ; VI-LABEL: fp_to_sint_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cvt_i32_f32_e32 v1, s3
-; VI-NEXT:    v_cvt_i32_f32_e32 v0, s2
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    v_cvt_i32_f32_e32 v1, s7
+; VI-NEXT:    v_cvt_i32_f32_e32 v0, s6
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: fp_to_sint_v2i32:
@@ -329,24 +329,24 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
 ;
 ; VI-LABEL: fp_to_sint_v2i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_mov_b32 s8, 0x2f800000
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_trunc_f32_e32 v0, s3
+; VI-NEXT:    v_trunc_f32_e32 v0, s7
 ; VI-NEXT:    v_mul_f32_e64 v1, |v0|, s8
-; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    v_floor_f32_e32 v1, v1
-; VI-NEXT:    s_mov_b32 s0, 0xcf800000
-; VI-NEXT:    v_fma_f32 v2, v1, s0, |v0|
-; VI-NEXT:    v_trunc_f32_e32 v4, s2
+; VI-NEXT:    s_mov_b32 s4, 0xcf800000
+; VI-NEXT:    v_fma_f32 v2, v1, s4, |v0|
+; VI-NEXT:    v_trunc_f32_e32 v4, s6
 ; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; VI-NEXT:    v_mul_f32_e64 v3, |v4|, s8
 ; VI-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; VI-NEXT:    v_floor_f32_e32 v3, v3
 ; VI-NEXT:    v_cvt_u32_f32_e32 v5, v3
-; VI-NEXT:    v_fma_f32 v3, v3, s0, |v4|
+; VI-NEXT:    v_fma_f32 v3, v3, s4, |v4|
 ; VI-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; VI-NEXT:    v_cvt_u32_f32_e32 v6, v3
 ; VI-NEXT:    v_xor_b32_e32 v2, v2, v0
@@ -357,9 +357,9 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
 ; VI-NEXT:    v_xor_b32_e32 v0, v6, v1
 ; VI-NEXT:    v_xor_b32_e32 v4, v5, v1
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: fp_to_sint_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index 5abf82aa1aab5..f8ede1cd557a7 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -60,15 +60,15 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x
 ;
 ; VI-LABEL: fp_to_uint_v2f32_to_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cvt_u32_f32_e32 v1, s3
-; VI-NEXT:    v_cvt_u32_f32_e32 v0, s2
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    v_cvt_u32_f32_e32 v1, s7
+; VI-NEXT:    v_cvt_u32_f32_e32 v0, s6
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: fp_to_uint_v2f32_to_v2i32:
@@ -264,26 +264,26 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
 ;
 ; VI-LABEL: fp_to_uint_v2f32_to_v2i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s0, 0xcf800000
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_trunc_f32_e32 v0, s3
-; VI-NEXT:    v_trunc_f32_e32 v4, s2
+; VI-NEXT:    v_trunc_f32_e32 v0, s7
+; VI-NEXT:    v_trunc_f32_e32 v4, s6
 ; VI-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; VI-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v4
 ; VI-NEXT:    v_floor_f32_e32 v5, v1
-; VI-NEXT:    s_mov_b32 s2, 0xcf800000
 ; VI-NEXT:    v_floor_f32_e32 v6, v2
-; VI-NEXT:    v_fma_f32 v0, v5, s2, v0
+; VI-NEXT:    v_fma_f32 v0, v5, s0, v0
 ; VI-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; VI-NEXT:    v_fma_f32 v0, v6, s2, v4
+; VI-NEXT:    v_fma_f32 v0, v6, s0, v4
 ; VI-NEXT:    v_cvt_u32_f32_e32 v3, v5
 ; VI-NEXT:    v_cvt_u32_f32_e32 v1, v6
 ; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: fp_to_uint_v2f32_to_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 4ea3323a9dbfc..fecf303d57691 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -126,23 +126,23 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; VI-LABEL: fshl_i32_imm:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 25
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    v_alignbit_b32 v2, s6, v0, 25
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fshl_i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 25
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_alignbit_b32 v1, s6, v1, 25
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_i32_imm:
@@ -159,20 +159,20 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: fshl_i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 25
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_alignbit_b32 v1, s6, s7, 25
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshl_i32_imm:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 25
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_alignbit_b32 v1, s6, s7, 25
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -720,29 +720,29 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ;
 ; VI-LABEL: orxor2or1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s4, s2, 7
-; VI-NEXT:    s_or_b32 s4, s3, s4
-; VI-NEXT:    s_cmp_eq_u32 s4, 0
-; VI-NEXT:    s_cselect_b32 s2, s2, s3
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_lshl_b32 s0, s6, 7
+; VI-NEXT:    s_or_b32 s0, s7, s0
+; VI-NEXT:    s_cmp_eq_u32 s0, 0
+; VI-NEXT:    s_cselect_b32 s0, s6, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: orxor2or1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s4, s2, 7
-; GFX9-NEXT:    s_or_b32 s4, s3, s4
-; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_lshl_b32 s0, s6, 7
+; GFX9-NEXT:    s_or_b32 s0, s7, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: orxor2or1:
@@ -761,29 +761,29 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ;
 ; GFX10-LABEL: orxor2or1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s4, s2, 7
-; GFX10-NEXT:    s_or_b32 s4, s3, s4
-; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s2, s2, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_lshl_b32 s0, s6, 7
+; GFX10-NEXT:    s_or_b32 s0, s7, s0
+; GFX10-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX10-NEXT:    s_cselect_b32 s0, s6, s7
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: orxor2or1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s4, s2, 7
+; GFX11-NEXT:    s_lshl_b32 s0, s6, 7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s4, s3, s4
-; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX11-NEXT:    s_or_b32 s0, s7, s0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX11-NEXT:    s_cselect_b32 s0, s6, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index e8310e73f9a47..a5ea1ee92a048 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -86,13 +86,13 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
 ; GFX11-LABEL: fshr_i32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x34
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x34
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_alignbit_b32 v0, s2, s3, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    v_alignbit_b32 v0, s6, s7, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -118,23 +118,23 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; VI-LABEL: fshr_i32_imm:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 7
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    v_alignbit_b32 v2, s6, v0, 7
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fshr_i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 7
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_alignbit_b32 v1, s6, v1, 7
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_i32_imm:
@@ -151,20 +151,20 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: fshr_i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 7
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_alignbit_b32 v1, s6, s7, 7
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshr_i32_imm:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 7
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_alignbit_b32 v1, s6, s7, 7
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index dac3a3db7b450..4d585cf56b9a1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -4140,27 +4140,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4391,27 +4391,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3
 ;
 ; VI-LABEL: atomic_cmpxchg_i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 516c92f1640ea..3050da034d236 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -4648,24 +4648,24 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_max_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s5, s3, 31
-; VI-NEXT:    s_mov_b32 s4, s3
-; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT:    s_add_u32 s4, s0, s4
-; VI-NEXT:    s_addc_u32 s5, s1, s5
-; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
-; VI-NEXT:    s_add_u32 s4, s4, 16
-; VI-NEXT:    s_addc_u32 s5, s5, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_ashr_i32 s1, s7, 31
+; VI-NEXT:    s_mov_b32 s0, s7
+; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT:    s_add_u32 s2, s4, s0
+; VI-NEXT:    s_addc_u32 s3, s5, s1
+; VI-NEXT:    s_load_dword s4, s[2:3], 0x10
+; VI-NEXT:    s_add_u32 s2, s2, 16
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_mov_b64 s[0:1], 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; VI-NEXT:    v_max_i32_e32 v2, s2, v3
+; VI-NEXT:    v_max_i32_e32 v2, s6, v3
 ; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -4679,28 +4679,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_mov_b32 s4, s3
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_mov_b32 s0, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_i32_e32 v0, s2, v1
+; GFX9-NEXT:    v_max_i32_e32 v0, s6, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB91_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -4861,22 +4861,22 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; VI-LABEL: atomic_max_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s5, s3, 31
-; VI-NEXT:    s_mov_b32 s4, s3
-; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT:    s_add_u32 s4, s0, s4
-; VI-NEXT:    s_addc_u32 s5, s1, s5
-; VI-NEXT:    s_load_dword s3, s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_ashr_i32 s1, s7, 31
+; VI-NEXT:    s_mov_b32 s0, s7
+; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT:    s_add_u32 s2, s4, s0
+; VI-NEXT:    s_addc_u32 s3, s5, s1
+; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_mov_b64 s[0:1], 0
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:  .LBB93_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; VI-NEXT:    v_max_i32_e32 v2, s2, v3
+; VI-NEXT:    v_max_i32_e32 v2, s6, v3
 ; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -4890,28 +4890,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_mov_b32 s4, s3
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_mov_b32 s0, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:  .LBB93_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_i32_e32 v0, s2, v1
+; GFX9-NEXT:    v_max_i32_e32 v0, s6, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB93_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -5901,24 +5901,24 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: atomic_umax_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s5, s3, 31
-; VI-NEXT:    s_mov_b32 s4, s3
-; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT:    s_add_u32 s4, s0, s4
-; VI-NEXT:    s_addc_u32 s5, s1, s5
-; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
-; VI-NEXT:    s_add_u32 s4, s4, 16
-; VI-NEXT:    s_addc_u32 s5, s5, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_ashr_i32 s1, s7, 31
+; VI-NEXT:    s_mov_b32 s0, s7
+; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT:    s_add_u32 s2, s4, s0
+; VI-NEXT:    s_addc_u32 s3, s5, s1
+; VI-NEXT:    s_load_dword s4, s[2:3], 0x10
+; VI-NEXT:    s_add_u32 s2, s2, 16
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_mov_b64 s[0:1], 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:  .LBB105_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; VI-NEXT:    v_max_u32_e32 v2, s2, v3
+; VI-NEXT:    v_max_u32_e32 v2, s6, v3
 ; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -5932,28 +5932,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_umax_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_mov_b32 s4, s3
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_mov_b32 s0, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:  .LBB105_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_u32_e32 v0, s2, v1
+; GFX9-NEXT:    v_max_u32_e32 v0, s6, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB105_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -7892,24 +7892,24 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_min_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s5, s3, 31
-; VI-NEXT:    s_mov_b32 s4, s3
-; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT:    s_add_u32 s4, s0, s4
-; VI-NEXT:    s_addc_u32 s5, s1, s5
-; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
-; VI-NEXT:    s_add_u32 s4, s4, 16
-; VI-NEXT:    s_addc_u32 s5, s5, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_ashr_i32 s1, s7, 31
+; VI-NEXT:    s_mov_b32 s0, s7
+; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT:    s_add_u32 s2, s4, s0
+; VI-NEXT:    s_addc_u32 s3, s5, s1
+; VI-NEXT:    s_load_dword s4, s[2:3], 0x10
+; VI-NEXT:    s_add_u32 s2, s2, 16
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_mov_b64 s[0:1], 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; VI-NEXT:    v_min_i32_e32 v2, s2, v3
+; VI-NEXT:    v_min_i32_e32 v2, s6, v3
 ; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -7923,28 +7923,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_min_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_mov_b32 s4, s3
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_mov_b32 s0, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_min_i32_e32 v0, s2, v1
+; GFX9-NEXT:    v_min_i32_e32 v0, s6, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB128_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index e2d55990473c0..adb803a22815c 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -100,13 +100,13 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
 ;
 ; GFX11-LABEL: load_v3f16_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1] offset:4
-; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5] offset:4
+; GFX11-NEXT:    global_store_b32 v0, v2, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -130,11 +130,11 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
 ;
 ; GFX11-LABEL: load_v4f16_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -349,14 +349,14 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ;
 ; GFX11-LABEL: extload_v3f16_to_v3f32_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
+; GFX11-NEXT:    s_lshr_b32 s0, s6, 16
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s6
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s7
+; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -398,16 +398,16 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ;
 ; GFX11-LABEL: extload_v4f16_to_v4f32_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX11-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s4
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s5
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT:    s_lshr_b32 s0, s7, 16
+; GFX11-NEXT:    s_lshr_b32 s1, s6, 16
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s6
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s7
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -649,12 +649,12 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ;
 ; GFX11-LABEL: extload_v3f16_to_v3f64_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, s2
+; GFX11-NEXT:    s_lshr_b32 s0, s6, 16
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s7
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, s6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v0
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v1
@@ -662,8 +662,8 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v6
 ; GFX11-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[4:5] offset:16
+; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -725,14 +725,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ;
 ; GFX11-LABEL: extload_v4f16_to_v4f64_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s5
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, s4
+; GFX11-NEXT:    s_lshr_b32 s1, s7, 16
+; GFX11-NEXT:    s_lshr_b32 s0, s6, 16
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s7
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s6
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v2
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v3
@@ -741,8 +741,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2693,11 +2693,11 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
 ;
 ; GFX11-LABEL: fadd_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_add_f16 v1, s2, s3
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_pk_add_f16 v1, s6, s7
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index f736ca7cd625a..18d5c057d156a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -85,19 +85,19 @@ entry:
 define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) {
 ; GCN-LABEL: float2_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s4, s[0:1], 0x34
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s4, 1
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
+; GCN-NEXT:    s_cmp_lg_u32 s2, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -289,20 +289,20 @@ entry:
 define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s0, 0x3c003c00
+; GCN-NEXT:    s_mov_b32 s1, s0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s6, s6, 4
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_lshl_b32 s2, s2, 4
+; GCN-NEXT:    s_lshl_b64 s[2:3], 0xffff, s2
+; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], s[2:3]
+; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -314,16 +314,16 @@ entry:
 define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half2_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s3, s3, 4
-; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GCN-NEXT:    s_andn2_b32 s2, s2, s3
-; GCN-NEXT:    s_and_b32 s3, s3, 0x3c003c00
-; GCN-NEXT:    s_or_b32 s2, s3, s2
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_lshl_b32 s0, s7, 4
+; GCN-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; GCN-NEXT:    s_andn2_b32 s1, s6, s0
+; GCN-NEXT:    s_and_b32 s0, s0, 0x3c003c00
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -397,16 +397,16 @@ entry:
 define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) {
 ; GCN-LABEL: short2_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s3, s3, 4
-; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GCN-NEXT:    s_andn2_b32 s2, s2, s3
-; GCN-NEXT:    s_and_b32 s3, s3, 0x10001
-; GCN-NEXT:    s_or_b32 s2, s3, s2
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_lshl_b32 s0, s7, 4
+; GCN-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; GCN-NEXT:    s_andn2_b32 s1, s6, s0
+; GCN-NEXT:    s_and_b32 s0, s0, 0x10001
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -418,20 +418,20 @@ entry:
 define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
 ; GCN-LABEL: short4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0x10001
-; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s0, 0x10001
+; GCN-NEXT:    s_mov_b32 s1, s0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s6, s6, 4
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_lshl_b32 s2, s2, 4
+; GCN-NEXT:    s_lshl_b64 s[2:3], 0xffff, s2
+; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], s[2:3]
+; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -443,19 +443,19 @@ entry:
 define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
 ; GCN-LABEL: byte8_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s4, s[0:1], 0x34
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s4, s4, 3
-; GCN-NEXT:    s_lshl_b64 s[4:5], 0xff, s4
-; GCN-NEXT:    s_and_b32 s7, s5, 0x1010101
-; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
-; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_lshl_b32 s0, s2, 3
+; GCN-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
+; GCN-NEXT:    s_and_b32 s3, s1, 0x1010101
+; GCN-NEXT:    s_and_b32 s2, s0, 0x1010101
+; GCN-NEXT:    s_andn2_b64 s[0:1], s[6:7], s[0:1]
+; GCN-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -962,34 +962,34 @@ entry:
 define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) {
 ; GCN-LABEL: bit4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GCN-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s7, 0xe80000
-; GCN-NEXT:    s_add_u32 s4, s4, s3
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_addc_u32 s5, s5, 0
+; GCN-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_mov_b32 s11, 0xe80000
+; GCN-NEXT:    s_add_u32 s8, s8, s3
+; GCN-NEXT:    s_addc_u32 s9, s9, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s3, s3, 3
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    v_lshrrev_b16_e64 v2, 1, s2
-; GCN-NEXT:    v_lshrrev_b16_e64 v3, 2, s2
-; GCN-NEXT:    v_lshrrev_b16_e64 v4, 3, s2
-; GCN-NEXT:    v_or_b32_e32 v0, s3, v0
+; GCN-NEXT:    s_and_b32 s0, s7, 3
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    v_lshrrev_b16_e64 v2, 1, s6
+; GCN-NEXT:    v_lshrrev_b16_e64 v3, 2, s6
+; GCN-NEXT:    v_lshrrev_b16_e64 v4, 3, s6
+; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
 ; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
-; GCN-NEXT:    buffer_store_byte v1, off, s[4:7], 0
-; GCN-NEXT:    buffer_store_byte v4, off, s[4:7], 0 offset:3
-; GCN-NEXT:    buffer_store_byte v3, off, s[4:7], 0 offset:2
-; GCN-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:1
+; GCN-NEXT:    buffer_store_byte v1, off, s[8:11], 0
+; GCN-NEXT:    buffer_store_byte v4, off, s[8:11], 0 offset:3
+; GCN-NEXT:    buffer_store_byte v3, off, s[8:11], 0 offset:2
+; GCN-NEXT:    buffer_store_byte v2, off, s[8:11], 0 offset:1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
-; GCN-NEXT:    buffer_store_byte v1, v0, s[4:7], 0 offen
-; GCN-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GCN-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:1
-; GCN-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0 offset:2
-; GCN-NEXT:    buffer_load_ubyte v3, off, s[4:7], 0 offset:3
+; GCN-NEXT:    buffer_store_byte v1, v0, s[8:11], 0 offen
+; GCN-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; GCN-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
+; GCN-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
+; GCN-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:3
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
@@ -1004,8 +1004,8 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GCN-NEXT:    v_and_b32_e32 v2, 15, v0
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_byte v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index df03e89370377..13134601cc33d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -436,193 +436,193 @@ entry:
 define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX9-NEXT:    s_mul_i32 s5, s4, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_add_i32 s6, s4, 1
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX9-NEXT:    s_mul_i32 s1, s0, s7
+; GFX9-NEXT:    s_sub_i32 s1, s6, s1
+; GFX9-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-NEXT:    s_sub_i32 s3, s1, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX9-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: udiv_i32:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX90A-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX90A-NEXT:    s_mul_i32 s4, s4, s5
-; GFX90A-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX90A-NEXT:    s_add_i32 s5, s5, s4
-; GFX90A-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX90A-NEXT:    s_mul_i32 s5, s4, s3
-; GFX90A-NEXT:    s_sub_i32 s2, s2, s5
-; GFX90A-NEXT:    s_add_i32 s6, s4, 1
-; GFX90A-NEXT:    s_sub_i32 s5, s2, s3
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX90A-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX90A-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX90A-NEXT:    s_add_i32 s5, s4, 1
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX90A-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX90A-NEXT:    s_mul_i32 s0, s0, s1
+; GFX90A-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX90A-NEXT:    s_add_i32 s1, s1, s0
+; GFX90A-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX90A-NEXT:    s_mul_i32 s1, s0, s7
+; GFX90A-NEXT:    s_sub_i32 s1, s6, s1
+; GFX90A-NEXT:    s_add_i32 s2, s0, 1
+; GFX90A-NEXT:    s_sub_i32 s3, s1, s7
+; GFX90A-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX90A-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX90A-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX90A-NEXT:    s_add_i32 s2, s0, 1
+; GFX90A-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX90A-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udiv_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX10-NEXT:    s_sub_i32 s5, 0, s3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX10-NEXT:    s_sub_i32 s1, 0, s7
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_mul_i32 s5, s5, s4
-; GFX10-NEXT:    s_mul_hi_u32 s5, s4, s5
-; GFX10-NEXT:    s_add_i32 s4, s4, s5
-; GFX10-NEXT:    s_mul_hi_u32 s4, s2, s4
-; GFX10-NEXT:    s_mul_i32 s5, s4, s3
-; GFX10-NEXT:    s_sub_i32 s2, s2, s5
-; GFX10-NEXT:    s_add_i32 s5, s4, 1
-; GFX10-NEXT:    s_sub_i32 s6, s2, s3
-; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX10-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX10-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX10-NEXT:    s_add_i32 s5, s4, 1
-; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX10-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_mul_i32 s1, s1, s0
+; GFX10-NEXT:    s_mul_hi_u32 s1, s0, s1
+; GFX10-NEXT:    s_add_i32 s0, s0, s1
+; GFX10-NEXT:    s_mul_hi_u32 s0, s6, s0
+; GFX10-NEXT:    s_mul_i32 s1, s0, s7
+; GFX10-NEXT:    s_add_i32 s2, s0, 1
+; GFX10-NEXT:    s_sub_i32 s1, s6, s1
+; GFX10-NEXT:    s_sub_i32 s3, s1, s7
+; GFX10-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX10-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX10-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX10-NEXT:    s_add_i32 s2, s0, 1
+; GFX10-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX10-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-FLATSCR-LABEL: udiv_i32:
 ; GFX9-FLATSCR:       ; %bb.0:
-; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-FLATSCR-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-FLATSCR-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX9-FLATSCR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-FLATSCR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-FLATSCR-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-FLATSCR-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX9-FLATSCR-NEXT:    s_mul_i32 s5, s4, s3
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-FLATSCR-NEXT:    s_add_i32 s6, s4, 1
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-FLATSCR-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s1, s0, s7
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s1, s6, s1
+; GFX9-FLATSCR-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s3, s1, s7
+; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX9-FLATSCR-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: udiv_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX11-NEXT:    s_sub_i32 s5, 0, s3
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX11-NEXT:    s_sub_i32 s1, 0, s7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_mul_i32 s5, s5, s4
+; GFX11-NEXT:    s_mul_i32 s1, s1, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mul_hi_u32 s5, s4, s5
-; GFX11-NEXT:    s_add_i32 s4, s4, s5
+; GFX11-NEXT:    s_mul_hi_u32 s1, s0, s1
+; GFX11-NEXT:    s_add_i32 s0, s0, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mul_hi_u32 s4, s2, s4
-; GFX11-NEXT:    s_mul_i32 s5, s4, s3
+; GFX11-NEXT:    s_mul_hi_u32 s0, s6, s0
+; GFX11-NEXT:    s_mul_i32 s1, s0, s7
+; GFX11-NEXT:    s_add_i32 s2, s0, 1
+; GFX11-NEXT:    s_sub_i32 s1, s6, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_sub_i32 s2, s2, s5
-; GFX11-NEXT:    s_add_i32 s5, s4, 1
-; GFX11-NEXT:    s_sub_i32 s6, s2, s3
-; GFX11-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX11-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX11-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX11-NEXT:    s_add_i32 s5, s4, 1
-; GFX11-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX11-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX11-NEXT:    s_sub_i32 s3, s1, s7
+; GFX11-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX11-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX11-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX11-NEXT:    s_add_i32 s2, s0, 1
+; GFX11-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX11-NEXT:    s_cselect_b32 s0, s2, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: udiv_i32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cvt_f32_u32 s4, s3
-; GFX12-NEXT:    s_sub_co_i32 s5, 0, s3
+; GFX12-NEXT:    s_cvt_f32_u32 s0, s7
+; GFX12-NEXT:    s_sub_co_i32 s1, 0, s7
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX12-NEXT:    v_rcp_iflag_f32_e32 v0, s4
-; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX12-NEXT:    v_rcp_iflag_f32_e32 v0, s0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
-; GFX12-NEXT:    s_mul_f32 s4, s4, 0x4f7ffffe
-; GFX12-NEXT:    s_cvt_u32_f32 s4, s4
+; GFX12-NEXT:    s_mul_f32 s0, s0, 0x4f7ffffe
+; GFX12-NEXT:    s_cvt_u32_f32 s0, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mul_i32 s5, s5, s4
-; GFX12-NEXT:    s_mul_hi_u32 s5, s4, s5
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_add_co_i32 s4, s4, s5
-; GFX12-NEXT:    s_mul_hi_u32 s4, s2, s4
+; GFX12-NEXT:    s_mul_i32 s1, s1, s0
+; GFX12-NEXT:    s_mul_hi_u32 s1, s0, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mul_i32 s5, s4, s3
-; GFX12-NEXT:    s_sub_co_i32 s2, s2, s5
-; GFX12-NEXT:    s_add_co_i32 s5, s4, 1
-; GFX12-NEXT:    s_sub_co_i32 s6, s2, s3
-; GFX12-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX12-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX12-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX12-NEXT:    s_add_co_i32 s5, s4, 1
-; GFX12-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX12-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX12-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-NEXT:    s_mul_hi_u32 s0, s6, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mul_i32 s1, s0, s7
+; GFX12-NEXT:    s_add_co_i32 s2, s0, 1
+; GFX12-NEXT:    s_sub_co_i32 s1, s6, s1
+; GFX12-NEXT:    s_sub_co_i32 s3, s1, s7
+; GFX12-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX12-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX12-NEXT:    s_add_co_i32 s2, s0, 1
+; GFX12-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s0, s2, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    s_endpgm
   %r = udiv i32 %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 69f181fcede30..a344128d94fcf 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -731,12 +731,12 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
 ;
 ; VI-LABEL: v2i32_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -794,12 +794,12 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
 ;
 ; VI-LABEL: v2f32_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -998,16 +998,16 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
 ;
 ; VI-LABEL: v3i16_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s0, 4
-; VI-NEXT:    s_addc_u32 s5, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v4, s3
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v5, s2
+; VI-NEXT:    s_add_u32 s0, s4, 4
+; VI-NEXT:    s_addc_u32 s1, s5, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v5, s6
 ; VI-NEXT:    flat_store_short v[2:3], v4
 ; VI-NEXT:    flat_store_dword v[0:1], v5
 ; VI-NEXT:    s_endpgm
@@ -1332,12 +1332,12 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
 ;
 ; VI-LABEL: v4i16_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1545,16 +1545,16 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i
 ;
 ; VI-LABEL: v5i8_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s0, 4
-; VI-NEXT:    s_addc_u32 s5, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v4, s3
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v5, s2
+; VI-NEXT:    s_add_u32 s0, s4, 4
+; VI-NEXT:    s_addc_u32 s1, s5, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v5, s6
 ; VI-NEXT:    flat_store_byte v[2:3], v4
 ; VI-NEXT:    flat_store_dword v[0:1], v5
 ; VI-NEXT:    s_endpgm
@@ -2397,12 +2397,12 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
 ;
 ; VI-LABEL: v8i8_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -4539,19 +4539,19 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou
 ;
 ; VI-LABEL: i65_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s4, s4, 1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_add_u32 s0, s0, 8
-; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    s_and_b32 s1, s2, 1
+; VI-NEXT:    s_add_u32 s0, s4, 8
+; VI-NEXT:    v_mov_b32_e32 v6, s1
+; VI-NEXT:    s_addc_u32 s1, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v6, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_store_byte v[4:5], v6
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -5565,18 +5565,18 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
 ;
 ; VI-LABEL: array_3xi32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    flat_store_short v[0:1], v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dword v[0:1], v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 920ff8a927e2d..01a1ab41c5cac 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -21,41 +21,41 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x
 ;
 ; VI-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s6, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, s2, s3
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, s6, s7
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, s6, s7
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index 17b941c59fd3f..a737c5e7dd265 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -15,47 +15,47 @@ declare half @llvm.fabs.f16(half) #0
 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) {
 ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; SDAG-GFX11:       ; %bb.0:
-; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
+; SDAG-GFX11-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; SDAG-GFX11-NEXT:    s_nop 0
 ; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
+; GISEL-GFX11-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GISEL-GFX11-NEXT:    s_nop 0
 ; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
@@ -66,47 +66,47 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) {
 ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; SDAG-GFX11:       ; %bb.0:
-; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
+; SDAG-GFX11-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; SDAG-GFX11-NEXT:    s_nop 0
 ; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
+; GISEL-GFX11-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GISEL-GFX11-NEXT:    s_nop 0
 ; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %src_input = call float @llvm.fabs.f32(float %src)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index ce055d6527996..7d41cf1c5bcb8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -16,53 +16,53 @@ declare half @llvm.fabs.f16(half) #0
 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
+; GFX11-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |s7|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s3
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |v0|
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |v0|
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s3
-; VI-SDAG-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |v0|
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s7
+; VI-SDAG-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |v0|
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
-; VI-GISEL-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |v0|
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s7
+; VI-GISEL-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |v0|
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
@@ -74,53 +74,53 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_f32_e64 s[2:3], |s2|, |s3|
+; GFX11-NEXT:    v_cmp_eq_f32_e64 s[0:1], |s6|, |s7|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s3
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_cmp_eq_f32_e64 s[2:3], |s2|, |v0|
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], |s6|, |v0|
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s3
-; VI-SDAG-NEXT:    v_cmp_eq_f32_e64 s[2:3], |s2|, |v0|
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s7
+; VI-SDAG-NEXT:    v_cmp_eq_f32_e64 s[0:1], |s6|, |v0|
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
-; VI-GISEL-NEXT:    v_cmp_eq_f32_e64 s[2:3], |s2|, |v0|
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s7
+; VI-GISEL-NEXT:    v_cmp_eq_f32_e64 s[0:1], |s6|, |v0|
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
index d7dd0ce58a08f..4a66b761306f3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32)
 
 ; GCN-LABEL: {{^}}global_atomic_csub_rtn:
 ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
-; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} th:TH_ATOMIC_RETURN
 define amdgpu_kernel void @global_atomic_csub_rtn(ptr addrspace(1) %ptr, i32 %data) {
 main_body:
   %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
@@ -15,7 +15,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}global_atomic_csub_no_rtn:
 ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
-; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1]
+; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @global_atomic_csub_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
 main_body:
   %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
@@ -24,7 +24,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}global_atomic_csub_off4_rtn:
 ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc
-; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4 th:TH_ATOMIC_RETURN
+; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 th:TH_ATOMIC_RETURN
 define amdgpu_kernel void @global_atomic_csub_off4_rtn(ptr addrspace(1) %ptr, i32 %data) {
 main_body:
   %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
@@ -34,7 +34,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}global_atomic_csub_off4_no_rtn:
 ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
-; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] offset:4
+; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
 define amdgpu_kernel void @global_atomic_csub_off4_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
 main_body:
   %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index 309fd99031155..ae61b58e95ac7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -1743,32 +1743,32 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
 define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; GFX11-LABEL: v_icmp_i1_ne0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_gt_u32 s2, 1
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX11-NEXT:    s_cmp_gt_u32 s3, 2
-; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX11-NEXT:    s_cmp_gt_u32 s6, 1
+; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX11-NEXT:    s_cmp_gt_u32 s7, 2
+; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s2, s2, s3
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_and_b32 s0, s0, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_icmp_i1_ne0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_gt_u32 s2, 1
-; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX10-NEXT:    s_cmp_gt_u32 s3, 2
-; GFX10-NEXT:    s_cselect_b32 s3, -1, 0
-; GFX10-NEXT:    s_and_b32 s2, s2, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_cmp_gt_u32 s6, 1
+; GFX10-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX10-NEXT:    s_cmp_gt_u32 s7, 2
+; GFX10-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX10-NEXT:    s_and_b32 s0, s0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %c0 = icmp ugt i32 %a, 1
   %c1 = icmp ugt i32 %b, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index 5f979e0177f58..54931ac345130 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -1952,51 +1952,51 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
 define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; GFX11-LABEL: v_icmp_i1_ne0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_gt_u32 s2, 1
-; GFX11-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX11-NEXT:    s_cmp_gt_u32 s3, 2
+; GFX11-NEXT:    s_cmp_gt_u32 s6, 1
+; GFX11-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX11-NEXT:    s_cmp_gt_u32 s7, 2
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    v_mov_b32_e32 v1, s3
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_icmp_i1_ne0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_gt_u32 s2, 1
-; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT:    s_cmp_gt_u32 s3, 2
+; VI-NEXT:    s_cmp_gt_u32 s6, 1
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    s_cmp_gt_u32 s7, 2
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_icmp_i1_ne0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_gt_u32 s2, 1
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_gt_u32 s3, 2
+; GFX9-NEXT:    s_cmp_gt_u32 s6, 1
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_cmp_gt_u32 s7, 2
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %c0 = icmp ugt i32 %a, 1
   %c1 = icmp ugt i32 %b, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index dba67a03c000e..b0706025f0b68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -14,12 +14,12 @@ entry:
 define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_iglp_opt_mfma_gemm:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GCN-NEXT:    ; iglp_opt mask(0x00000000)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    v_add_u32_e32 v1, s2, v0
 ; GCN-NEXT:    v_add_u32_e32 v2, 0x6000, v1
 ; GCN-NEXT:    ds_read_b128 a[28:31], v2 offset:57456
 ; GCN-NEXT:    ds_read_b128 a[24:27], v2 offset:57440
@@ -44,7 +44,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in,
 ; GCN-NEXT:    ds_read_b128 a[152:155], v1 offset:96
 ; GCN-NEXT:    ds_read_b128 a[68:71], v1 offset:24592
 ; GCN-NEXT:    ds_read_b128 a[64:67], v1 offset:24576
-; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
+; GCN-NEXT:    v_add_u32_e32 v0, s3, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(4)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
 ; GCN-NEXT:    ds_read_b128 a[148:151], v1 offset:80
@@ -80,7 +80,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in,
 ; GCN-NEXT:    ds_write_b128 v0, a[136:139] offset:32
 ; GCN-NEXT:    ds_write_b128 v0, a[132:135] offset:16
 ; GCN-NEXT:    ds_write_b128 v0, a[128:131]
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(8)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
 ; GCN-NEXT:    ds_write_b128 v0, a[56:59] offset:24672
@@ -151,13 +151,13 @@ entry:
 define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GCN-NEXT:    ; iglp_opt mask(0x00000001)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    v_add_u32_e32 v1, s2, v0
 ; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:112
 ; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:96
 ; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:80
@@ -176,7 +176,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
 ; GCN-NEXT:    ds_read_b128 a[136:139], v1 offset:8224
 ; GCN-NEXT:    ds_read_b128 a[132:135], v1 offset:8208
 ; GCN-NEXT:    ds_read_b128 a[128:131], v1 offset:8192
-; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
+; GCN-NEXT:    v_add_u32_e32 v0, s3, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
 ; GCN-NEXT:    ds_read_b128 a[124:127], v1 offset:24688
@@ -218,7 +218,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
 ; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32
 ; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16
 ; GCN-NEXT:    ds_write_b128 v0, a[0:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    ds_write_b128 v0, a[152:155] offset:8288
 ; GCN-NEXT:    ds_write_b128 v0, a[156:159] offset:8304
 ; GCN-NEXT:    ds_write_b128 v0, a[144:147] offset:8256
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 8d0397c936a03..3a77b3bc9cb95 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -163,7 +163,7 @@ main_body:
 define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX1013:       ; %bb.0: ; %main_body
-; GFX1013-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1013-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1013-NEXT:    v_mov_b32_e32 v6, 4.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x40a00000
@@ -171,10 +171,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40e00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x41000000
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_add_co_u32 v2, s0, s0, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s0, s1, 0, s0
-; GFX1013-NEXT:    v_add_co_u32 v4, s0, s2, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v2, s0, s4, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s0, s5, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v4, s0, s6, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s0, s7, 0, s0
 ; GFX1013-NEXT:    flat_load_dword v0, v[2:3]
 ; GFX1013-NEXT:    flat_load_dword v1, v[4:5]
 ; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
@@ -182,14 +182,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x41000000
 ; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40e00000
@@ -199,33 +199,33 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_add_co_u32 v0, s0, s0, v2
-; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
-; GFX1030-NEXT:    v_add_co_u32 v2, s0, s2, v2
-; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s0
+; GFX1030-NEXT:    v_add_co_u32 v0, s0, s4, v2
+; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s0
+; GFX1030-NEXT:    v_add_co_u32 v2, s0, s6, v2
+; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s0
 ; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v2, 2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 2.0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, s0, s0, v2
+; GFX11-NEXT:    v_add_co_u32 v0, s0, s4, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
-; GFX11-NEXT:    v_add_co_u32 v2, s0, s2, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, s6, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s0
 ; GFX11-NEXT:    flat_load_b32 v9, v[0:1]
 ; GFX11-NEXT:    flat_load_b32 v10, v[2:3]
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40c00000
@@ -233,7 +233,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41000000
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0x40400000
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[4:7]
+; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[8:11]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
 ; GFX11-NEXT:    s_endpgm
@@ -260,15 +260,15 @@ main_body:
 define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX1013:       ; %bb.0: ; %main_body
-; GFX1013-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1013-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1013-NEXT:    v_mov_b32_e32 v6, 0x46004500
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x48004700
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_add_co_u32 v2, s0, s0, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s0, s1, 0, s0
-; GFX1013-NEXT:    v_add_co_u32 v4, s0, s2, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v2, s0, s4, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s0, s5, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v4, s0, s6, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s0, s7, 0, s0
 ; GFX1013-NEXT:    flat_load_dword v0, v[2:3]
 ; GFX1013-NEXT:    flat_load_dword v1, v[4:5]
 ; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
@@ -276,53 +276,53 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x44004200
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x44004200
 ; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x46004500
 ; GFX1030-NEXT:    v_mov_b32_e32 v7, 0x48004700
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_add_co_u32 v0, s0, s0, v2
-; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
-; GFX1030-NEXT:    v_add_co_u32 v2, s0, s2, v2
-; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s0
+; GFX1030-NEXT:    v_add_co_u32 v0, s0, s4, v2
+; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s0
+; GFX1030-NEXT:    v_add_co_u32 v2, s0, s6, v2
+; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s0
 ; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v0, s0, s0, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
-; GFX11-NEXT:    v_add_co_u32 v2, s0, s2, v2
+; GFX11-NEXT:    v_add_co_u32 v0, s0, s4, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, s6, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s0
 ; GFX11-NEXT:    flat_load_b32 v6, v[0:1]
 ; GFX11-NEXT:    flat_load_b32 v7, v[2:3]
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x47004400
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0x48004500
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16
+; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[8:11] a16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 265d64f47bb23..634159aec9db5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -85,12 +85,12 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0
 ;
 ; GFX12-LABEL: v_permlane16_b32_vii:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, 1, 2
-; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -131,13 +131,13 @@ define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0
 ;
 ; GFX12-LABEL: v_permlane16_b32_vll:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-NEXT:    s_movk_i32 s0, 0x1234
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT:    s_movk_i32 s2, 0x1234
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
-; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_permlane16_b32 v0, v0, s0, 0xc1d1
+; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -203,35 +203,35 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX12-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s3, s2
-; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -245,76 +245,76 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
 define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    s_mov_b32 null, 0
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s7
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_mov_b32 null, 0
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s7
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s7
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s7
+; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
-; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s7
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s7
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -325,84 +325,72 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0
 }
 
 define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-SDAG-LABEL: v_permlane16_b32_vsv:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s3, s2
-; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-SDAG-NEXT:    s_endpgm
-;
-; GFX10-GISEL-LABEL: v_permlane16_b32_vsv:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-GISEL-NEXT:    s_endpgm
+; GFX10-LABEL: v_permlane16_b32_vsv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 null, 0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    v_permlane16_b32 v0, v0, s7, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s3, s2
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s7, s0
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s7, s0
+; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s3, s2
-; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s7, s0
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s7, s0
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -621,12 +609,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vii:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
-; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -667,13 +655,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vll:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-NEXT:    s_movk_i32 s0, 0x1234
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT:    s_movk_i32 s2, 0x1234
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s0, 0xc1d1
+; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -739,35 +727,35 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX12-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s3, s2
-; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -781,76 +769,76 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
 define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    s_mov_b32 null, 0
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s7
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_mov_b32 null, 0
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s7
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s7
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s7
+; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
-; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s7
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s7
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -861,84 +849,72 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src
 }
 
 define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s3, s2
-; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-SDAG-NEXT:    s_endpgm
-;
-; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-GISEL-NEXT:    s_endpgm
+; GFX10-LABEL: v_permlanex16_b32_vsv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 null, 0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s3, s2
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s7, s0
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
+; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s3, s2
-; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s7, s0
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
index 973678291e263..77a975f7abe0f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
@@ -10,26 +10,26 @@ declare i32 @llvm.amdgcn.workitem.id.y()
 define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -41,26 +41,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr
 define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %src0) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vi:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v1, v0
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s6
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -72,26 +72,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr
 define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %src0) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vl:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc1d1
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v1, v0
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s6
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -103,25 +103,25 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr
 define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v1, v0
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -134,26 +134,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -165,26 +165,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_bc:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -196,26 +196,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi_bc:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -227,26 +227,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i
 define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -258,26 +258,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %src0) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vi:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v1, v0
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s6
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -289,26 +289,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %src0) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vl:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc1d1
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v1, v0
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s6
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -320,25 +320,25 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v1, v0
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -351,26 +351,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %
 define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -382,26 +382,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_bc:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -413,26 +413,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi_bc:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 47c021769aa56..bb3418ca5de99 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -91,58 +91,58 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ;
 ; VARIANT4-LABEL: test_barrier:
 ; VARIANT4:       ; %bb.0: ; %entry
-; VARIANT4-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; VARIANT4-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; VARIANT4-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; VARIANT4-NEXT:    s_wait_kmcnt 0x0
-; VARIANT4-NEXT:    v_xad_u32 v1, v0, -1, s2
-; VARIANT4-NEXT:    global_store_b32 v3, v0, s[0:1]
+; VARIANT4-NEXT:    v_xad_u32 v1, v0, -1, s6
+; VARIANT4-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; VARIANT4-NEXT:    s_wait_storecnt 0x0
 ; VARIANT4-NEXT:    s_barrier_signal -1
 ; VARIANT4-NEXT:    s_barrier_wait -1
 ; VARIANT4-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; VARIANT4-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; VARIANT4-NEXT:    v_lshlrev_b64_e32 v[1:2], 2, v[1:2]
-; VARIANT4-NEXT:    v_add_co_u32 v1, vcc_lo, s0, v1
+; VARIANT4-NEXT:    v_add_co_u32 v1, vcc_lo, s4, v1
 ; VARIANT4-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; VARIANT4-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo
+; VARIANT4-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s5, v2, vcc_lo
 ; VARIANT4-NEXT:    global_load_b32 v0, v[1:2], off
 ; VARIANT4-NEXT:    s_wait_loadcnt 0x0
-; VARIANT4-NEXT:    global_store_b32 v3, v0, s[0:1]
+; VARIANT4-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; VARIANT4-NEXT:    s_nop 0
 ; VARIANT4-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; VARIANT4-NEXT:    s_endpgm
 ;
 ; VARIANT5-LABEL: test_barrier:
 ; VARIANT5:       ; %bb.0: ; %entry
-; VARIANT5-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; VARIANT5-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; VARIANT5-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; VARIANT5-NEXT:    s_wait_kmcnt 0x0
-; VARIANT5-NEXT:    v_xad_u32 v1, v0, -1, s2
-; VARIANT5-NEXT:    global_store_b32 v3, v0, s[0:1]
+; VARIANT5-NEXT:    v_xad_u32 v1, v0, -1, s6
+; VARIANT5-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; VARIANT5-NEXT:    s_barrier_signal -1
 ; VARIANT5-NEXT:    s_barrier_wait -1
 ; VARIANT5-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; VARIANT5-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; VARIANT5-NEXT:    v_lshlrev_b64_e32 v[1:2], 2, v[1:2]
-; VARIANT5-NEXT:    v_add_co_u32 v1, vcc_lo, s0, v1
+; VARIANT5-NEXT:    v_add_co_u32 v1, vcc_lo, s4, v1
 ; VARIANT5-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; VARIANT5-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo
+; VARIANT5-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s5, v2, vcc_lo
 ; VARIANT5-NEXT:    global_load_b32 v0, v[1:2], off
 ; VARIANT5-NEXT:    s_wait_loadcnt 0x0
-; VARIANT5-NEXT:    global_store_b32 v3, v0, s[0:1]
+; VARIANT5-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; VARIANT5-NEXT:    s_nop 0
 ; VARIANT5-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; VARIANT5-NEXT:    s_endpgm
 ;
 ; VARIANT6-LABEL: test_barrier:
 ; VARIANT6:       ; %bb.0: ; %entry
-; VARIANT6-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; VARIANT6-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; VARIANT6-NEXT:    v_lshlrev_b32_e32 v5, 2, v0
 ; VARIANT6-NEXT:    s_wait_kmcnt 0x0
-; VARIANT6-NEXT:    s_sub_co_i32 s2, s2, 1
-; VARIANT6-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
-; VARIANT6-NEXT:    v_sub_nc_u32_e32 v1, s2, v0
-; VARIANT6-NEXT:    global_store_b32 v5, v0, s[0:1]
+; VARIANT6-NEXT:    s_sub_co_i32 s0, s6, 1
+; VARIANT6-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; VARIANT6-NEXT:    v_sub_nc_u32_e32 v1, s0, v0
+; VARIANT6-NEXT:    global_store_b32 v5, v0, s[4:5]
 ; VARIANT6-NEXT:    s_wait_storecnt 0x0
 ; VARIANT6-NEXT:    s_barrier_signal -1
 ; VARIANT6-NEXT:    s_barrier_wait -1
@@ -154,7 +154,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT6-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
 ; VARIANT6-NEXT:    global_load_b32 v0, v[1:2], off
 ; VARIANT6-NEXT:    s_wait_loadcnt 0x0
-; VARIANT6-NEXT:    global_store_b32 v5, v0, s[0:1]
+; VARIANT6-NEXT:    global_store_b32 v5, v0, s[4:5]
 ; VARIANT6-NEXT:    s_nop 0
 ; VARIANT6-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; VARIANT6-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 38a34ec6daf73..3eb226193051d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -518,33 +518,33 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
 define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
 ; GCN-LABEL: test1_s_barrier_init:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GCN-NEXT:    v_mul_u32_u24_e32 v1, v0, v0
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_lshl_b32 s2, s2, 16
-; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
-; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    s_lshl_b32 s0, s6, 16
+; GCN-NEXT:    global_store_b32 v3, v2, s[4:5]
+; GCN-NEXT:    s_mov_b32 m0, s0
 ; GCN-NEXT:    s_barrier_init -1
-; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test1_s_barrier_init:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s2
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s6
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[4:5]
 ; GLOBAL-ISEL-NEXT:    s_barrier_init -1
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
@@ -562,33 +562,33 @@ entry:
 define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
 ; GCN-LABEL: test2_s_barrier_init:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GCN-NEXT:    v_mul_u32_u24_e32 v1, v0, v0
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_lshl_b32 s2, s2, 16
-; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
-; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    s_lshl_b32 s0, s6, 16
+; GCN-NEXT:    global_store_b32 v3, v2, s[4:5]
+; GCN-NEXT:    s_mov_b32 m0, s0
 ; GCN-NEXT:    s_barrier_init 1
-; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test2_s_barrier_init:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s2
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s6
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[4:5]
 ; GLOBAL-ISEL-NEXT:    s_barrier_init 1
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
@@ -606,33 +606,33 @@ entry:
 define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
 ; GCN-LABEL: test3_s_barrier_init:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GCN-NEXT:    v_mul_u32_u24_e32 v1, v0, v0
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_lshl_b32 s2, s2, 16
-; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
-; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    s_lshl_b32 s0, s6, 16
+; GCN-NEXT:    global_store_b32 v3, v2, s[4:5]
+; GCN-NEXT:    s_mov_b32 m0, s0
 ; GCN-NEXT:    s_barrier_init 0
-; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test3_s_barrier_init:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s2
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s6
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[4:5]
 ; GLOBAL-ISEL-NEXT:    s_barrier_init 0
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
@@ -650,35 +650,35 @@ entry:
 define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 {
 ; GCN-LABEL: test4_s_barrier_init:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mul_u32_u24_e32 v1, v0, v0
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_lshl_b32 s3, s3, 16
-; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
-; GCN-NEXT:    s_or_b32 s2, s2, s3
-; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    s_lshl_b32 s0, s7, 16
+; GCN-NEXT:    global_store_b32 v3, v2, s[4:5]
+; GCN-NEXT:    s_or_b32 s0, s6, s0
+; GCN-NEXT:    s_mov_b32 m0, s0
 ; GCN-NEXT:    s_barrier_init m0
-; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test4_s_barrier_init:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_lshl_b32 s3, 16, s3
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
-; GLOBAL-ISEL-NEXT:    s_or_b32 m0, s2, s3
+; GLOBAL-ISEL-NEXT:    s_lshl_b32 s0, 16, s7
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[4:5]
+; GLOBAL-ISEL-NEXT:    s_or_b32 m0, s6, s0
 ; GLOBAL-ISEL-NEXT:    s_barrier_init m0
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
@@ -852,33 +852,33 @@ entry:
 define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 {
 ; GCN-LABEL: test4_s_barrier_join_m0:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GCN-NEXT:    v_mul_u32_u24_e32 v2, v0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_mov_b32 m0, s2
-; GCN-NEXT:    global_store_b32 v3, v1, s[0:1]
+; GCN-NEXT:    s_mov_b32 m0, s6
+; GCN-NEXT:    global_store_b32 v3, v1, s[4:5]
 ; GCN-NEXT:    s_barrier_join m0
-; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s2
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s6
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[4:5]
 ; GLOBAL-ISEL-NEXT:    s_barrier_join m0
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
@@ -1098,33 +1098,33 @@ entry:
 define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 {
 ; GCN-LABEL: test4_s_wakeup_barrier_m0:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GCN-NEXT:    v_mul_u32_u24_e32 v2, v0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_mov_b32 m0, s2
-; GCN-NEXT:    global_store_b32 v3, v1, s[0:1]
+; GCN-NEXT:    s_mov_b32 m0, s6
+; GCN-NEXT:    global_store_b32 v3, v1, s[4:5]
 ; GCN-NEXT:    s_wakeup_barrier m0
-; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s2
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s6
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[4:5]
 ; GLOBAL-ISEL-NEXT:    s_wakeup_barrier m0
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[4:5]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
@@ -1278,32 +1278,32 @@ entry:
 define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 {
 ; GCN-LABEL: test4_s_get_barrier_state_m0:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_mov_b32 m0, s2
-; GCN-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GCN-NEXT:    s_get_barrier_state s2, m0
+; GCN-NEXT:    s_mov_b32 m0, s6
+; GCN-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GCN-NEXT:    s_get_barrier_state s0, m0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s2
-; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GLOBAL-ISEL-NEXT:    s_get_barrier_state s2, m0
+; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s6
+; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GLOBAL-ISEL-NEXT:    s_get_barrier_state s0, m0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
-; GLOBAL-ISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT:    v_mov_b32_e32 v1, s0
+; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
index c2e74eb05d164..673e9bf08eb2b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
@@ -5,12 +5,12 @@
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v40, 5, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT:    v_add_nc_u32_e32 v32, s0, v40
-; GCN-NEXT:    v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40
+; GCN-NEXT:    v_add_nc_u32_e32 v32, s2, v40
+; GCN-NEXT:    v_dual_mov_b32 v81, s3 :: v_dual_add_nc_u32 v80, s3, v40
 ; GCN-NEXT:    ds_load_b128 v[4:7], v32 offset:16
 ; GCN-NEXT:    ds_load_b128 v[12:15], v32 offset:2064
 ; GCN-NEXT:    ds_load_b128 v[20:23], v32 offset:6160
@@ -72,12 +72,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v40, 5, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v32, s0, v40
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40
+; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v32, s2, v40
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v81, s3 :: v_dual_add_nc_u32 v80, s3, v40
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v32 offset:16
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[12:15], v32 offset:2064
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[20:23], v32 offset:6160
@@ -175,12 +175,12 @@ entry:
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 5, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT:    v_add_nc_u32_e32 v17, s0, v16
-; GCN-NEXT:    v_add_nc_u32_e32 v16, s1, v16
+; GCN-NEXT:    v_add_nc_u32_e32 v17, s2, v16
+; GCN-NEXT:    v_add_nc_u32_e32 v16, s3, v16
 ; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:16
 ; GCN-NEXT:    ds_load_b128 v[0:3], v17
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
@@ -196,7 +196,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
 ; GCN-NEXT:    ds_store_b128 v16, v[8:11]
 ; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:2064
 ; GCN-NEXT:    ds_load_b128 v[0:3], v17 offset:2048
-; GCN-NEXT:    v_mov_b32_e32 v16, s1
+; GCN-NEXT:    v_mov_b32_e32 v16, s3
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -256,12 +256,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v16, 5, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v17, s0, v16
-; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v16, s1, v16
+; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v17, s2, v16
+; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v16, s3, v16
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:16
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
@@ -277,7 +277,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[8:11]
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:2064
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17 offset:2048
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v16, s1
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v16, s3
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
index fdcb1773d0a3f..bc29441edc247 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
@@ -7,13 +7,13 @@ declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16(
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v28, 4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v48, 0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GCN-NEXT:    v_add_nc_u32_e32 v0, s0, v28
-; GCN-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
+; GCN-NEXT:    v_add_nc_u32_e32 v0, s2, v28
+; GCN-NEXT:    v_dual_mov_b32 v50, s3 :: v_dual_add_nc_u32 v49, s3, v28
 ; GCN-NEXT:    ds_load_b128 v[8:11], v0
 ; GCN-NEXT:    ds_load_b128 v[12:15], v0 offset:512
 ; GCN-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
@@ -58,13 +58,13 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v28, 4, v0
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v48, 0
 ; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
 ; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v0, s0, v28
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
+; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v0, s2, v28
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v50, s3 :: v_dual_add_nc_u32 v49, s3, v28
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v0
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[12:15], v0 offset:512
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
@@ -147,11 +147,11 @@ entry:
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v18, 0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_lshl_add_u32 v17, v0, 5, s0
-; GCN-NEXT:    v_lshl_add_u32 v0, v0, 4, s1
+; GCN-NEXT:    v_lshl_add_u32 v17, v0, 5, s2
+; GCN-NEXT:    v_lshl_add_u32 v0, v0, 4, s3
 ; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:1024
 ; GCN-NEXT:    ds_load_b128 v[1:4], v17
 ; GCN-NEXT:    ds_load_b128 v[5:8], v17 offset:16
@@ -165,7 +165,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    ds_store_b128 v0, v[13:16]
 ; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:2560
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
 ; GCN-NEXT:    s_wait_dscnt 0x0
@@ -210,11 +210,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v18, 0
 ; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
-; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v17, v0, 5, s0
-; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v0, v0, 4, s1
+; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v17, v0, 5, s2
+; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v0, v0, 4, s3
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:1024
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[1:4], v17
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[5:8], v17 offset:16
@@ -228,7 +228,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16]
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:2560
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s3
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 10f09b6390aba..ae5b62ffb285b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -29,18 +29,18 @@ entry:
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GCN-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(8) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
@@ -82,32 +82,32 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 ; GCN-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; GCN-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
-; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
-; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:112
+; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:96
+; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:80
+; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:64
+; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
+; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:32
+; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:16
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(30) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
 ; GCN-NEXT:    s_endpgm
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
+; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(7)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
@@ -149,14 +149,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:32
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(30) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
@@ -178,17 +178,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:96
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; GCN-NEXT:    v_mul_lo_u32 v31, v31, v31
@@ -198,12 +198,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:112
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -218,11 +218,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:80
 ; GCN-NEXT:    v_mul_lo_u32 v14, v14, v14
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:32
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -242,14 +242,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; GCN-NEXT:    v_mul_lo_u32 v20, v20, v20
 ; GCN-NEXT:    v_mul_lo_u32 v16, v16, v16
-; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
-; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
-; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:80
-; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:64
-; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:32
-; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:16
-; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:112
+; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:96
+; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:80
+; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:64
+; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
+; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:32
+; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:16
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -258,17 +258,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:96
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v31, v31, v31
@@ -278,12 +278,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:112
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -298,11 +298,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:80
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v14, v14, v14
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:64
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:32
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -322,14 +322,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v20, v20, v20
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v16, v16, v16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:80
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:32
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:32
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -381,18 +381,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GCN-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:48
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v7, v7, v7
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
 ; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
@@ -403,25 +403,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:112
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:96
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:96
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:64
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -430,9 +430,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
-; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:64
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:16
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -452,15 +452,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:80
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:80
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -469,18 +469,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
+; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:48
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v7, v7, v7
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
@@ -491,25 +491,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:112
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:96
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:96
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:64
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -518,9 +518,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:16
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -540,15 +540,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:80
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:80
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -614,10 +614,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    v_add_u32_e32 v1, s2, v0
 ; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:112
 ; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:96
 ; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:80
@@ -661,7 +661,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; GCN-NEXT:    ds_read_b128 a[136:139], v2 offset:57376
 ; GCN-NEXT:    ds_read_b128 a[140:143], v2 offset:57392
 ; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
-; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
+; GCN-NEXT:    v_add_u32_e32 v0, s3, v0
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(14)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
@@ -681,7 +681,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32
 ; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16
 ; GCN-NEXT:    ds_write_b128 v0, a[0:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    ds_write_b128 v0, a[56:59] offset:8288
 ; GCN-NEXT:    ds_write_b128 v0, a[60:63] offset:8304
 ; GCN-NEXT:    ds_write_b128 v0, a[48:51] offset:8256
@@ -720,10 +720,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s0, v0
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s2, v0
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:112
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:96
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:80
@@ -767,7 +767,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[136:139], v2 offset:57376
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[140:143], v2 offset:57392
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v2, 2.0
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s1, v0
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s3, v0
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(14)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
@@ -787,7 +787,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:32
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:16
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3]
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s3
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[56:59] offset:8288
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[60:63] offset:8304
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[48:51] offset:8256
@@ -862,12 +862,12 @@ entry:
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v0, s0, v1
+; GCN-NEXT:    v_add_u32_e32 v0, s2, v1
 ; GCN-NEXT:    ds_read_b128 a[28:31], v0 offset:112
 ; GCN-NEXT:    ds_read_b128 a[24:27], v0 offset:96
 ; GCN-NEXT:    ds_read_b128 a[20:23], v0 offset:80
@@ -878,7 +878,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ds_read_b128 a[12:15], v0 offset:48
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT:    v_add_u32_e32 v1, s1, v1
+; GCN-NEXT:    v_add_u32_e32 v1, s3, v1
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    s_nop 7
@@ -902,7 +902,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ds_read_b128 a[0:3], v0 offset:8192
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
@@ -995,12 +995,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v2, 1.0
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v3, 2.0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s0, v1
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s2, v1
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v0 offset:112
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v0 offset:96
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v0 offset:80
@@ -1011,7 +1011,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v0 offset:48
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s1, v1
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s3, v1
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_nop 7
@@ -1035,7 +1035,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v0 offset:8192
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v1, s1
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v1, s3
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index fc33206845a71..8f8994e78fd06 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -211,21 +211,21 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half>
 define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
 ; GCN-LABEL: set_inactive_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_mov_b32 s8, 1
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s9, s8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
   store <2 x i32> %tmp, ptr addrspace(1) %out
@@ -235,21 +235,21 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
 define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GCN-LABEL: set_inactive_v2f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_mov_b32 s8, 1.0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s9, s8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
   store <2 x float> %tmp, ptr addrspace(1) %out
@@ -279,21 +279,21 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa
 define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
 ; GCN-LABEL: set_inactive_v4i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_mov_b32 s8, 0x10001
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s9, s8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
   store <4 x i16> %tmp, ptr addrspace(1) %out
@@ -303,21 +303,21 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
 define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; GCN-LABEL: set_inactive_v4f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_mov_b32 s8, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s9, s8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
   store <4 x half> %tmp, ptr addrspace(1) %out
@@ -327,21 +327,21 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
 define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
 ; GCN-LABEL: set_inactive_v4bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_mov_b32 s8, 0x3f803f80
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s9, s8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
   store <4 x bfloat> %tmp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index 0755dcddd8f46..87c5f5bd7b784 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -18,15 +18,15 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0,
 ;
 ; VI-LABEL: bfe_u32_arg_arg_arg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_bfe_u32 v0, v0, s3, s3
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_bfe_u32 v0, v0, s7, s7
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1)
   store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -50,16 +50,16 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0,
 ;
 ; VI-LABEL: bfe_u32_arg_arg_imm:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x7b
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_bfe_u32 v0, s2, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_bfe_u32 v0, s6, v0, v1
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123)
   store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -83,16 +83,16 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0,
 ;
 ; VI-LABEL: bfe_u32_arg_imm_arg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_bfe_u32 v0, s2, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_bfe_u32 v0, s6, v0, v1
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2)
   store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -117,17 +117,17 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1,
 ;
 ; VI-LABEL: bfe_u32_imm_arg_arg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_movk_i32 s8, 0x7b
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    v_bfe_u32 v0, s8, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2)
   store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1625,16 +1625,16 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0
 ;
 ; VI-LABEL: v_lshr_and:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_lshr_b32 s0, s2, s3
-; VI-NEXT:    s_and_b32 s0, s0, 7
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_lshr_b32 s4, s6, s7
+; VI-NEXT:    s_and_b32 s4, s4, 7
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %c = lshr i32 %a, %b
   %d = and i32 %c, 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 3a867879bb809..4f65acda2a210 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -336,33 +336,33 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
 define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-SDAG-LABEL: s_exp_v2f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3fb8a000
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    s_and_b32 s4, s3, 0xfffff000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v2, s3, v2
+; VI-SDAG-NEXT:    s_and_b32 s0, s7, 0xfffff000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, s7, v2
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v2
 ; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x39a3b295
-; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s4, v0
-; VI-SDAG-NEXT:    v_mul_f32_e32 v5, s4, v4
-; VI-SDAG-NEXT:    s_and_b32 s4, s2, 0xfffff000
+; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s0, v0
+; VI-SDAG-NEXT:    v_mul_f32_e32 v5, s0, v4
+; VI-SDAG-NEXT:    s_and_b32 s0, s6, 0xfffff000
 ; VI-SDAG-NEXT:    v_rndne_f32_e32 v3, v1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s4
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; VI-SDAG-NEXT:    v_add_f32_e32 v2, v5, v2
-; VI-SDAG-NEXT:    v_sub_f32_e32 v6, s2, v6
+; VI-SDAG-NEXT:    v_sub_f32_e32 v6, s6, v6
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s4, v0
+; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s0, v0
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x39a3b295, v6
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3fb8a000, v6
 ; VI-SDAG-NEXT:    v_exp_f32_e32 v1, v1
 ; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; VI-SDAG-NEXT:    v_rndne_f32_e32 v5, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s4, v4
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s0, v4
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v5
 ; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v6
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v4
@@ -370,48 +370,48 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v4, v5
 ; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v2
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v4
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v2
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_exp_v2f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8a000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x39a3b295
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    s_and_b32 s4, s2, 0xfffff000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT:    s_and_b32 s0, s6, 0xfffff000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT:    v_sub_f32_e32 v2, s6, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s4, v0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s0, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v4
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, s4, v1
-; VI-GISEL-NEXT:    s_and_b32 s4, s3, 0xfffff000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s4
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, s0, v1
+; VI-GISEL-NEXT:    s_and_b32 s0, s7, 0xfffff000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v4, v2
 ; VI-GISEL-NEXT:    v_rndne_f32_e32 v4, v3
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, s3, v5
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, s7, v5
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x39a3b295, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3fb8a000, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s0, v1
 ; VI-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v2, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
@@ -422,77 +422,77 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; VI-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc2ce8ed0
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v4
+; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v3
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX900-SDAG-LABEL: s_exp_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc2ce8ed0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s3, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s7, v0
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v0, -v2
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v0, -v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v1, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s2, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v1, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s6, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v7, v6
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v0, -v6
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v0, -v6
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v8, v6, v7
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v1, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v1, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, v8, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v0, -v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v0, -v2
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v1, v3
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s7, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v1, v3
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v0, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v0, -v5
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v1, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v1, v0
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v1, v5
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
@@ -502,18 +502,18 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; SI-SDAG-LABEL: s_exp_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index a162949587481..ff20f90f05ca0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -338,33 +338,33 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) {
 define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-SDAG-LABEL: s_exp10_v2f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x40549000
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    s_and_b32 s4, s3, 0xfffff000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT:    v_sub_f32_e32 v2, s3, v2
+; VI-SDAG-NEXT:    s_and_b32 s0, s7, 0xfffff000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, s7, v2
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v2
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x40549000, v2
 ; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3a2784bc
-; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s4, v0
-; VI-SDAG-NEXT:    v_mul_f32_e32 v5, s4, v4
-; VI-SDAG-NEXT:    s_and_b32 s4, s2, 0xfffff000
+; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s0, v0
+; VI-SDAG-NEXT:    v_mul_f32_e32 v5, s0, v4
+; VI-SDAG-NEXT:    s_and_b32 s0, s6, 0xfffff000
 ; VI-SDAG-NEXT:    v_rndne_f32_e32 v3, v1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s4
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; VI-SDAG-NEXT:    v_add_f32_e32 v2, v5, v2
-; VI-SDAG-NEXT:    v_sub_f32_e32 v6, s2, v6
+; VI-SDAG-NEXT:    v_sub_f32_e32 v6, s6, v6
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s4, v0
+; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s0, v0
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x3a2784bc, v6
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x40549000, v6
 ; VI-SDAG-NEXT:    v_exp_f32_e32 v1, v1
 ; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; VI-SDAG-NEXT:    v_rndne_f32_e32 v5, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s4, v4
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s0, v4
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v5
 ; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v6
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v4
@@ -372,48 +372,48 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v4, v5
 ; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc23369f4
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v2
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x421a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v4
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v2
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_exp10_v2f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x40549000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3a2784bc
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    s_and_b32 s4, s2, 0xfffff000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT:    s_and_b32 s0, s6, 0xfffff000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT:    v_sub_f32_e32 v2, s6, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x40549000, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s4, v0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s0, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v4
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, s4, v1
-; VI-GISEL-NEXT:    s_and_b32 s4, s3, 0xfffff000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s4
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, s0, v1
+; VI-GISEL-NEXT:    s_and_b32 s0, s7, 0xfffff000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v4, v2
 ; VI-GISEL-NEXT:    v_rndne_f32_e32 v4, v3
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, s3, v5
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, s7, v5
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3a2784bc, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x40549000, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s0, v1
 ; VI-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v2, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
@@ -424,77 +424,77 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; VI-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc23369f4
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x421a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v4
+; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v3
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX900-SDAG-LABEL: s_exp10_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x40549a78
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x33979a37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc23369f4
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s3, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s7, v0
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v0, -v2
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v0, -v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v1, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s2, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v1, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s6, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v7, v6
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v0, -v6
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v0, -v6
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v8, v6, v7
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v1, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v1, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, v8, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v3, 0x421a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp10_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x40549a78
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x33979a37
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v0, -v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v0, -v2
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v1, v3
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s7, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v1, v3
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v0, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v0, -v5
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v1, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v1, v0
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v1, v5
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
@@ -504,18 +504,18 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc23369f4
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x421a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; SI-SDAG-LABEL: s_exp10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 36e78975cdb01..06fa910366584 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -222,25 +222,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; VI-SDAG-LABEL: s_exp2_v2f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; VI-SDAG-NEXT:    v_add_f32_e32 v4, s3, v4
-; VI-SDAG-NEXT:    v_add_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT:    v_add_f32_e32 v4, s7, v4
+; VI-SDAG-NEXT:    v_add_f32_e32 v1, s6, v1
 ; VI-SDAG-NEXT:    v_exp_f32_e32 v4, v4
 ; VI-SDAG-NEXT:    v_exp_f32_e32 v2, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v1, v4, v3
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, v2, v0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
@@ -270,25 +270,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_exp2_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX900-SDAG-NEXT:    v_add_f32_e32 v4, s3, v4
-; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, s2, v1
+; GFX900-SDAG-NEXT:    v_add_f32_e32 v4, s7, v4
+; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, s6, v1
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v4, v4
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, v4, v3
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index d847af780acab..ad70589b544ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -568,15 +568,15 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ;
 ; GFX1100-SDAG-LABEL: s_log_v2f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s3
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s2
+; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s7
+; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s6
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1
+; GFX1100-SDAG-NEXT:    v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v1, v1
@@ -589,29 +589,29 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX1100-SDAG-NEXT:    v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1100-SDAG-NEXT:    v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x41b17218, s4
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 0x41b17218, s5
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x41b17218, s0
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 0x41b17218, s1
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
 ; GFX1100-SDAG-NEXT:    v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5
-; GFX1100-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
 ; GFX1100-SDAG-NEXT:    s_nop 0
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log_v2f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s6
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s7
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s7, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
@@ -624,14 +624,14 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x41b17218, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x41b17218, s5
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x41b17218, s0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x41b17218, s1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
 ; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
-; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX1100-GISEL-NEXT:    s_nop 0
 ; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 3f060de9f6596..82c73fa441aaf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -568,15 +568,15 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX1100-SDAG-LABEL: s_log10_v2f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s3
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s2
+; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s7
+; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s6
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1
+; GFX1100-SDAG-NEXT:    v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v1, v1
@@ -589,29 +589,29 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-SDAG-NEXT:    v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1100-SDAG-NEXT:    v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x411a209b, s4
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 0x411a209b, s5
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x411a209b, s0
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 0x411a209b, s1
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
 ; GFX1100-SDAG-NEXT:    v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5
-; GFX1100-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
 ; GFX1100-SDAG-NEXT:    s_nop 0
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log10_v2f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s6
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s7
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s7, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
@@ -624,14 +624,14 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x411a209b, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x411a209b, s5
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x411a209b, s0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x411a209b, s1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
 ; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
-; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX1100-GISEL-NEXT:    s_nop 0
 ; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 035b2439eff15..b76e6214922bd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -265,25 +265,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; VI-SDAG-LABEL: s_log2_v2f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s3, v4
-; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s7, v4
+; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; VI-SDAG-NEXT:    v_log_f32_e32 v4, v4
 ; VI-SDAG-NEXT:    v_log_f32_e32 v2, v1
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v4, v3
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v2, v0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
@@ -313,25 +313,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_log2_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v4, s3, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v4, s7, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v4, v4
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v1, v4, v3
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_log2_v2f32:
@@ -359,49 +359,49 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX1100-SDAG-LABEL: s_log2_v2f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s3
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s2
+; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s7
+; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s6
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s4
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s5
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 0x42000000, s4
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s5
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s1
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 0x42000000, s0
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, s3, v1
-; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v3, s2, v3
+; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, s7, v1
+; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v3, s6, v3
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v1, v1
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v3, v3
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2
-; GFX1100-SDAG-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
 ; GFX1100-SDAG-NEXT:    s_nop 0
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log2_v2f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s6
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s7
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 0x42000000, s5
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 0x42000000, s1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s7, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX1100-GISEL-NEXT:    s_nop 0
 ; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index 826862e124920..9fcbdf3968693 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -224,11 +224,11 @@ define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) {
 ;
 ; VI-LABEL: local_size_yz:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x1c
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x1c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mul_i32 s0, s0, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_mul_i32 s0, s4, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index 7ad7cc821c1b5..c5d2f791d1677 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -135,54 +135,54 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #
 ;
 ; GFX89-LABEL: round_v2f32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX89-NEXT:    s_brev_b32 s8, -2
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s3, 0xf000
+; GFX89-NEXT:    s_mov_b32 s2, -1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s3
-; GFX89-NEXT:    v_sub_f32_e32 v1, s3, v0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v2, s3
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s7
+; GFX89-NEXT:    v_sub_f32_e32 v1, s7, v0
+; GFX89-NEXT:    s_mov_b32 s0, s4
+; GFX89-NEXT:    s_mov_b32 s1, s5
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
+; GFX89-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX89-NEXT:    v_bfi_b32 v1, s8, v1, v2
 ; GFX89-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s2
-; GFX89-NEXT:    v_sub_f32_e32 v2, s2, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v3, s2
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX89-NEXT:    v_sub_f32_e32 v2, s6, v0
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX89-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX89-NEXT:    v_bfi_b32 v2, s8, v2, v3
 ; GFX89-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v0, s3
-; GFX11-NEXT:    v_trunc_f32_e32 v2, s2
+; GFX11-NEXT:    v_trunc_f32_e32 v0, s7
+; GFX11-NEXT:    v_trunc_f32_e32 v2, s6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_sub_f32_e32 v1, s3, v0
-; GFX11-NEXT:    v_sub_f32_e32 v3, s2, v2
+; GFX11-NEXT:    v_sub_f32_e32 v1, s7, v0
+; GFX11-NEXT:    v_sub_f32_e32 v3, s6, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s4, |v1|, 0.5
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s4
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v1|, 0.5
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s4, |v3|, 0.5
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, s3
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v3|, 0.5
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, s7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s4
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s0
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX11-NEXT:    v_add_f32_e32 v1, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v3, s2
-; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v3, s6
+; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    v_add_f32_e32 v0, v2, v3
-; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 994ef22539a65..67071327e3aff 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -8,28 +8,28 @@
 define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
 ; GFX9-LABEL: s_lshr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s3, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s7, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_lshr_v2i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s4, s2, 0xffff
-; VI-NEXT:    s_lshr_b32 s2, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    s_lshr_b32 s2, s2, s5
-; VI-NEXT:    s_lshr_b32 s3, s4, s3
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_or_b32 s2, s3, s2
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_lshr_b32 s1, s6, 16
+; VI-NEXT:    s_lshr_b32 s2, s7, 16
+; VI-NEXT:    s_and_b32 s0, s6, 0xffff
+; VI-NEXT:    s_lshr_b32 s1, s1, s2
+; VI-NEXT:    s_lshr_b32 s0, s0, s7
+; VI-NEXT:    s_lshl_b32 s1, s1, 16
+; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -54,20 +54,20 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 ;
 ; GFX10-LABEL: s_lshr_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s7, s6
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_lshr_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_pk_lshrrev_b16 v1, s7, s6
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 9ec37a5e14cdf..def0dfa4b903b 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -190,11 +190,11 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX8-LABEL: madak_2_use_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -206,8 +206,8 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v4, v[4:5] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0x41200000
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_madak_f32 v6, v7, v8, 0x41200000
@@ -220,61 +220,61 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX9-LABEL: madak_2_use_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x41200000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
 ; GFX9-NEXT:    v_mac_f32_e32 v4, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v0, v4, s[2:3] offset:4
+; GFX9-NEXT:    global_store_dword v0, v4, s[6:7] offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: madak_2_use_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-MAD-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
 ; GFX10-MAD-NEXT:    v_madak_f32 v1, v1, v3, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX10-MAD-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[6:7] offset:4
 ; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
 ; GFX11-MAD-LABEL: madak_2_use_f32:
 ; GFX11-MAD:       ; %bb.0:
-; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
 ; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
+; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[6:7] offset:4 glc dlc
 ; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-MAD-NEXT:    global_load_b32 v3, v0, s[2:3] offset:8 glc dlc
+; GFX11-MAD-NEXT:    global_load_b32 v3, v0, s[6:7] offset:8 glc dlc
 ; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-MAD-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT:    v_dual_add_f32 v1, 0x41200000, v1 :: v_dual_add_f32 v2, 0x41200000, v2
-; GFX11-MAD-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-MAD-NEXT:    global_store_b32 v0, v2, s[4:5] dlc
 ; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[2:3] offset:4 dlc
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[6:7] offset:4 dlc
 ; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-MAD-NEXT:    s_nop 0
 ; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -282,59 +282,59 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX940-FMA-LABEL: madak_2_use_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v4, 0x41200000
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7] sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
 ; GFX940-FMA-NEXT:    v_fmac_f32_e32 v4, v1, v3
-; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v4, s[6:7] offset:4 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: madak_2_use_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v1, v1, v3, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX10-FMA-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[6:7] offset:4
 ; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: madak_2_use_f32:
 ; GFX11-FMA:       ; %bb.0:
-; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
+; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[6:7] offset:4 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b32 v3, v0, s[2:3] offset:8 glc dlc
+; GFX11-FMA-NEXT:    global_load_b32 v3, v0, s[6:7] offset:8 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
 ; GFX11-FMA-NEXT:    v_fmaak_f32 v1, v1, v3, 0x41200000
-; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[4:5] dlc
 ; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[2:3] offset:4 dlc
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[6:7] offset:4 dlc
 ; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FMA-NEXT:    s_nop 0
 ; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -381,15 +381,15 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX8-LABEL: madak_m_inline_imm_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_madak_f32 v2, 4.0, v3, 0x41200000
@@ -398,72 +398,72 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-LABEL: madak_m_inline_imm_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: madak_m_inline_imm_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-MAD-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
 ; GFX11-MAD-LABEL: madak_m_inline_imm_f32:
 ; GFX11-MAD:       ; %bb.0:
-; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[6:7]
 ; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, 4.0, v1
 ; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
-; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-MAD-NEXT:    s_nop 0
 ; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
 ;
 ; GFX940-FMA-LABEL: madak_m_inline_imm_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
-; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: madak_m_inline_imm_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: madak_m_inline_imm_f32:
 ; GFX11-FMA:       ; %bb.0:
-; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[6:7]
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
-; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-FMA-NEXT:    s_nop 0
 ; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
@@ -919,78 +919,78 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ;
 ; GFX8-LABEL: s_s_madak_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x41200000
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    v_mac_f32_e32 v2, s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    v_mac_f32_e32 v2, s6, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_s_madak_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    v_mac_f32_e32 v1, s2, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_mac_f32_e32 v1, s6, v2
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: s_s_madak_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10-MAD-NEXT:    v_madak_f32 v0, s2, v0, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, s7
+; GFX10-MAD-NEXT:    v_madak_f32 v0, s6, v0, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
 ; GFX11-MAD-LABEL: s_s_madak_f32:
 ; GFX11-MAD:       ; %bb.0:
-; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-MAD-NEXT:    v_mul_f32_e64 v0, s2, s3
+; GFX11-MAD-NEXT:    v_mul_f32_e64 v0, s6, s7
 ; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 0x41200000, v0
-; GFX11-MAD-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-MAD-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-MAD-NEXT:    s_nop 0
 ; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
 ;
 ; GFX940-FMA-LABEL: s_s_madak_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, s3
-; GFX940-FMA-NEXT:    v_fmac_f32_e32 v1, s2, v2
-; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, s7
+; GFX940-FMA-NEXT:    v_fmac_f32_e32 v1, s6, v2
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: s_s_madak_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10-FMA-NEXT:    v_fmaak_f32 v0, s2, v0, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, s7
+; GFX10-FMA-NEXT:    v_fmaak_f32 v0, s6, v0, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: s_s_madak_f32:
 ; GFX11-FMA:       ; %bb.0:
-; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
+; GFX11-FMA-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s7
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FMA-NEXT:    v_fmaak_f32 v0, s2, v0, 0x41200000
-; GFX11-FMA-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-FMA-NEXT:    v_fmaak_f32 v0, s6, v0, 0x41200000
+; GFX11-FMA-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-FMA-NEXT:    s_nop 0
 ; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 940287d44d8d1..5c88328b6dd91 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -5,41 +5,41 @@
 define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) {
 ; GCN-LABEL: vector_clause:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:48
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-SCRATCH-LABEL: vector_clause:
 ; GCN-SCRATCH:       ; %bb.0: ; %bb
-; GCN-SCRATCH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-SCRATCH-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
 ; GCN-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    s_clause 0x3
-; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
-; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
-; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
-; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
+; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
+; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:16
+; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:32
+; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:48
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(3)
-; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
+; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(2)
-; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
-; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; GCN-SCRATCH-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
index b08da2e1848ff..a87503c731c7a 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
@@ -8,9 +8,9 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: merge_s_load_x1_x1
     ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+    ; CHECK-NEXT: early-clobber %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
     %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32))
@@ -48,16 +48,16 @@ body: |
   bb.0:
     ; GFX11-LABEL: name: merge_s_load_x1_x1_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+    ; GFX11-NEXT: early-clobber %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %4.sub0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %4.sub1
     ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
     ;
     ; GFX12-LABEL: name: merge_s_load_x1_x1_x1
     ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2
+    ; GFX12-NEXT: early-clobber %5:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %5.sub0_sub1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %5.sub2
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0
     ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1
     %0:sgpr_64 = IMPLICIT_DEF
@@ -72,9 +72,9 @@ body: |
   bb.0:
     ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+    ; GFX11-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %7.sub0_sub1
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed %7.sub2_sub3
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1
     ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0
@@ -82,9 +82,9 @@ body: |
     ;
     ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1
     ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3
+    ; GFX12-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %7.sub0_sub1_sub2
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %7.sub3
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1
     ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub2
     ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
@@ -102,9 +102,9 @@ body: |
   bb.0:
     ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; GFX11-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY]].sub2_sub3
     ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
@@ -120,9 +120,9 @@ body: |
     ;
     ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
     ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; GFX12-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_96 = COPY [[COPY]].sub0_sub1_sub2
     ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub3
     ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1
@@ -157,9 +157,9 @@ body: |
     ;
     ; GFX12-LABEL: name: merge_s_load_x2_x1
     ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2
+    ; GFX12-NEXT: early-clobber %3:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub2
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
     %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32))
@@ -171,9 +171,9 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: merge_s_load_x2_x2
     ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+    ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
     %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
@@ -185,9 +185,9 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: merge_s_load_x2_x2_x2_x2
     ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; CHECK-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
     ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
@@ -205,9 +205,9 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: merge_s_load_x3_x1
     ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128))
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3
+    ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128))
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %3.sub0_sub1_sub2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub3
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_96 = S_LOAD_DWORDX3_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96))
     %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32))
@@ -219,10 +219,118 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: merge_s_load_x4_x4
     ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; CHECK-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128))
     %2:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128))
 ...
+
+# The constrained multi-dword scalar load merge tests.
+---
+name: merge_s_load_x1_x2ec
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: merge_s_load_x1_x2ec
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
+    ; CHECK-NEXT: early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s64))
+    %0:sgpr_64 = IMPLICIT_DEF
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
+    early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s64))
+...
+
+---
+name: merge_s_load_x1_x3ec
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: merge_s_load_x1_x3ec
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
+    ; CHECK-NEXT: early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s96), align 16)
+    %0:sgpr_64 = IMPLICIT_DEF
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
+    early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s96))
+...
+
+---
+name: merge_s_load_x2ec_x1
+body: |
+  bb.0:
+    ; GFX11-LABEL: name: merge_s_load_x2ec_x1
+    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX11-NEXT: early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64))
+    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
+    ;
+    ; GFX12-LABEL: name: merge_s_load_x2ec_x1
+    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GFX12-NEXT: early-clobber %3:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub2
+    %0:sgpr_64 = IMPLICIT_DEF
+    early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
+    %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32))
+...
+
+---
+name: merge_s_load_x2ec_x2ec
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: merge_s_load_x2ec_x2ec
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3
+    %0:sgpr_64 = IMPLICIT_DEF
+    early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
+    early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
+...
+
+---
+name: merge_s_load_x2ec_x2ec_x2ec_x2ec
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+    %0:sgpr_64 = IMPLICIT_DEF
+    early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
+    early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
+    early-clobber %3:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s64))
+    early-clobber %4:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64))
+...
+
+---
+name: merge_s_load_x3ec_x1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: merge_s_load_x3ec_x1
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128))
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %3.sub0_sub1_sub2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub3
+    %0:sgpr_64 = IMPLICIT_DEF
+    early-clobber %1:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96))
+    %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32))
+...
+
+---
+name: merge_s_load_x4ec_x4ec
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: merge_s_load_x4ec_x4ec
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7
+    %0:sgpr_64 = IMPLICIT_DEF
+    early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128))
+    early-clobber %2:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128))
+...
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 9dafa27ece86f..2334543157b6e 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -185,13 +185,13 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX11-LABEL: s_test_imin_sle_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_i32 s2, s2, s3
+; GFX11-NEXT:    s_min_i32 s0, s6, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -257,13 +257,13 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ;
 ; GFX11-LABEL: s_test_imin_sle_v1i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_i32 s2, s2, s3
+; GFX11-NEXT:    s_min_i32 s0, s6, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -808,11 +808,11 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ;
 ; GFX11-LABEL: s_test_imin_sle_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_min_i16 v1, s2, s3
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_pk_min_i16 v1, s6, s7
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1324,13 +1324,13 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX11-LABEL: s_test_imin_slt_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_i32 s2, s2, s3
+; GFX11-NEXT:    s_min_i32 s0, s6, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2084,13 +2084,13 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX11-LABEL: s_test_umin_ule_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_u32 s2, s2, s3
+; GFX11-NEXT:    s_min_u32 s0, s6, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2405,13 +2405,13 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX11-LABEL: s_test_umin_ult_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_u32 s2, s2, s3
+; GFX11-NEXT:    s_min_u32 s0, s6, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2763,13 +2763,13 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ;
 ; GFX11-LABEL: s_test_umin_ult_v1i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_u32 s2, s2, s3
+; GFX11-NEXT:    s_min_u32 s0, s6, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
index 9d6e0927b0dfd..c98cfa08160ca 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
@@ -8,17 +8,17 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
 define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: ctlz_i64_poison:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -40,23 +40,23 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ctlz_i64_poison:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
@@ -76,7 +76,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
@@ -87,17 +87,17 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: ctlz_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -120,23 +120,23 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ctlz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
@@ -157,7 +157,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone
@@ -168,17 +168,17 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: cttz_i64_poison:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -200,23 +200,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX9-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cttz_i64_poison:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:7
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
@@ -238,7 +238,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
@@ -249,17 +249,17 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: cttz_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -282,23 +282,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cttz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:7
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
@@ -321,7 +321,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index b4272049f36a4..3c60153df441e 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -676,14 +676,17 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
 ;
 ; GFX12-LABEL: mul64_sext_c:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mul_u64 s[4:5], s[2:3], 0x50
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_mov_b32 s2, -1
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_ashr_i32 s1, s6, 31
+; GFX12-NEXT:    s_mov_b32 s0, s6
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mul_u64 s[0:1], s[0:1], 0x50
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_mov_b32 s0, s4
+; GFX12-NEXT:    s_mov_b32 s1, s5
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -784,13 +787,17 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
 ;
 ; GFX12-LABEL: mul64_zext_c:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT:    s_mov_b32 s3, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mul_u64 s[4:5], s[2:3], 0x50
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-NEXT:    s_mov_b32 s1, 0
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_mov_b32 s2, -1
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s0, s6
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mul_u64 s[0:1], s[0:1], 0x50
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_mov_b32 s0, s4
+; GFX12-NEXT:    s_mov_b32 s1, s5
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 357b851a8f56f..6d7bf0027a822 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -24,32 +24,32 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; VI-LABEL: test_smul24_i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_bfe_i32 s0, s2, 0x180000
-; VI-NEXT:    s_bfe_i32 s1, s3, 0x180000
-; VI-NEXT:    s_mul_i32 s0, s0, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_bfe_i32 s4, s6, 0x180000
+; VI-NEXT:    s_bfe_i32 s5, s7, 0x180000
+; VI-NEXT:    s_mul_i32 s4, s4, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_smul24_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
-; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_bfe_i32 s4, s6, 0x180000
+; GFX9-NEXT:    s_bfe_i32 s5, s7, 0x180000
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: test_smul24_i32:
@@ -113,30 +113,30 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; VI-LABEL: test_smulhi24_i64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s2, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s6, v0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_smulhi24_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
-; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
-; GFX9-NEXT:    s_mul_hi_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_bfe_i32 s4, s6, 0x180000
+; GFX9-NEXT:    s_bfe_i32 s5, s7, 0x180000
+; GFX9-NEXT:    s_mul_hi_i32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: test_smulhi24_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 3a16c88f32cc3..e6470a5833d3a 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -24,32 +24,32 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; VI-LABEL: test_umul24_i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_and_b32 s0, s2, 0xffffff
-; VI-NEXT:    s_and_b32 s1, s3, 0xffffff
-; VI-NEXT:    s_mul_i32 s0, s0, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_and_b32 s4, s6, 0xffffff
+; VI-NEXT:    s_and_b32 s5, s7, 0xffffff
+; VI-NEXT:    s_mul_i32 s4, s4, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_umul24_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
-; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_and_b32 s4, s6, 0xffffff
+; GFX9-NEXT:    s_and_b32 s5, s7, 0xffffff
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
   %0 = shl i32 %a, 8
@@ -392,30 +392,30 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a,
 ;
 ; VI-LABEL: test_umulhi24_i32_i64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_mul_hi_u32_u24_e32 v0, s2, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_mul_hi_u32_u24_e32 v0, s6, v0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_umulhi24_i32_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
-; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
-; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_and_b32 s4, s6, 0xffffff
+; GFX9-NEXT:    s_and_b32 s5, s7, 0xffffff
+; GFX9-NEXT:    s_mul_hi_u32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
   %a.24 = and i32 %a, 16777215
@@ -647,30 +647,30 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; VI-LABEL: test_umulhi16_i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_and_b32 s0, s2, 0xffff
-; VI-NEXT:    s_and_b32 s1, s3, 0xffff
-; VI-NEXT:    s_mul_i32 s0, s0, s1
-; VI-NEXT:    s_lshr_b32 s0, s0, 16
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_and_b32 s4, s6, 0xffff
+; VI-NEXT:    s_and_b32 s5, s7, 0xffff
+; VI-NEXT:    s_mul_i32 s4, s4, s5
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_umulhi16_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX9-NEXT:    s_mul_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
+; GFX9-NEXT:    s_and_b32 s0, s6, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s7, 0xffff
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %a.16 = and i32 %a, 65535
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 0473f803bfb30..65f4a1b17a881 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -156,15 +156,15 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
 ;
 ; GFX8-LABEL: scalar_or_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
-; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, s0
-; GFX8-NEXT:    s_or_b32 s0, s2, s3
-; GFX8-NEXT:    s_mov_b32 s5, s1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT:    s_mov_b32 s0, s4
+; GFX8-NEXT:    s_or_b32 s4, s6, s7
+; GFX8-NEXT:    s_mov_b32 s1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: scalar_or_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
index a3f7906a05f6b..1899a0abc6592 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -4,17 +4,17 @@
 define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -35,17 +35,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -67,17 +67,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -99,17 +99,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -132,17 +132,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_neg_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -163,17 +163,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_neg_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -194,16 +194,16 @@ bb:
 define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: add_vector_neg_bitcast_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v0, v0
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
-; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
@@ -222,11 +222,11 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_lo_neg_scalar_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v3, v1
@@ -237,7 +237,7 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspa
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GCN-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1
-; GCN-NEXT:    global_store_dword v4, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v4, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -261,10 +261,10 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v3, v1
@@ -273,7 +273,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v3 neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index e076df97e1ba4..3f8b64b618e9d 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -720,13 +720,13 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture %
 define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 {
 ; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[4:5] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
 ; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg:
@@ -934,13 +934,13 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture %
 define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 {
 ; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[4:5] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
 ; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg:
@@ -1172,11 +1172,11 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture %
 define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 {
 ; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
 ; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg:
@@ -1592,13 +1592,13 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2
 define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) #0 {
 ; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[4:5] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
 ; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg:
@@ -1983,14 +1983,14 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128
 define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) #0 {
 ; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-NO-PRELOAD-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte_d16_hi v0, v1, s[4:5] offset:6 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[4:5] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
 ; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg:
diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
index 5bb260c09c9dd..2ce0b9eed02cb 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
@@ -5,28 +5,28 @@
 define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr addrspace(8) noalias %b) {
 ; SDAG-LABEL: buffers_dont_alias:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; SDAG-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; SDAG-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; SDAG-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; SDAG-NEXT:    v_mul_f32_e32 v3, v3, v3
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: buffers_dont_alias:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v3, v3, v3
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GISEL-NEXT:    s_endpgm
   %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0)
   %s0 = fmul float %l0, %l0
@@ -50,40 +50,40 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a
 define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) {
 ; SDAG-LABEL: buffers_from_flat_dont_alias:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-NEXT:    s_mov_b32 s7, 0
-; SDAG-NEXT:    s_mov_b32 s6, 16
+; SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; SDAG-NEXT:    s_mov_b32 s3, 0
+; SDAG-NEXT:    s_mov_b32 s2, 16
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    s_and_b32 s5, s1, 0xffff
-; SDAG-NEXT:    s_mov_b32 s4, s0
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; SDAG-NEXT:    s_and_b32 s5, s3, 0xffff
-; SDAG-NEXT:    s_mov_b32 s4, s2
+; SDAG-NEXT:    s_and_b32 s1, s5, 0xffff
+; SDAG-NEXT:    s_mov_b32 s0, s4
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT:    s_and_b32 s1, s7, 0xffff
+; SDAG-NEXT:    s_mov_b32 s0, s6
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; SDAG-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; SDAG-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; SDAG-NEXT:    v_mul_f32_e32 v3, v3, v3
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: buffers_from_flat_dont_alias:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-NEXT:    s_mov_b32 s7, 0
-; GISEL-NEXT:    s_mov_b32 s6, 16
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-NEXT:    s_mov_b32 s3, 0
+; GISEL-NEXT:    s_mov_b32 s2, 16
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_and_b32 s5, s1, 0xffff
-; GISEL-NEXT:    s_mov_b32 s4, s0
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    s_and_b32 s5, s3, 0xffff
-; GISEL-NEXT:    s_mov_b32 s4, s2
+; GISEL-NEXT:    s_and_b32 s1, s5, 0xffff
+; GISEL-NEXT:    s_mov_b32 s0, s4
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GISEL-NEXT:    s_and_b32 s1, s7, 0xffff
+; GISEL-NEXT:    s_mov_b32 s0, s6
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v3, v3, v3
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GISEL-NEXT:    s_endpgm
   %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0)
   %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index a87973d93ac77..4a004731c42f9 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -35,35 +35,35 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX8-LABEL: rotl_i32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sub_i32 s3, 32, s3
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    v_alignbit_b32 v2, s2, s2, v0
+; GFX8-NEXT:    s_sub_i32 s0, 32, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_alignbit_b32 v2, s6, s6, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: rotl_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sub_i32 s3, 32, s3
-; GFX10-NEXT:    v_alignbit_b32 v1, s2, s2, s3
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_sub_i32 s0, 32, s7
+; GFX10-NEXT:    v_alignbit_b32 v1, s6, s6, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotl_i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sub_i32 s3, 32, s3
+; GFX11-NEXT:    s_sub_i32 s0, 32, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_alignbit_b32 v1, s2, s2, s3
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_alignbit_b32 v1, s6, s6, s0
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 058ee589bc4b0..d6431d731a1f7 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -32,31 +32,31 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX8-LABEL: rotr_i32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NEXT:    v_alignbit_b32 v2, s2, s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    v_alignbit_b32 v2, s6, s6, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: rotr_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s2, s2, s3
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_alignbit_b32 v1, s6, s6, s7
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotr_i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, s2, s2, s3
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_alignbit_b32 v1, s6, s6, s7
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index b81af3eb838f1..1384fb0e0203a 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -8,34 +8,34 @@
 define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
 ; GFX9-LABEL: s_shl_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s3, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s7, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_v2i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshr_b32 s0, s2, 16
-; VI-NEXT:    s_lshr_b32 s1, s3, 16
-; VI-NEXT:    s_lshl_b32 s0, s0, s1
-; VI-NEXT:    s_lshl_b32 s1, s2, s3
-; VI-NEXT:    s_lshl_b32 s0, s0, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_or_b32 s0, s1, s0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_lshr_b32 s4, s6, 16
+; VI-NEXT:    s_lshr_b32 s5, s7, 16
+; VI-NEXT:    s_lshl_b32 s4, s4, s5
+; VI-NEXT:    s_lshl_b32 s5, s6, s7
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; VI-NEXT:    s_or_b32 s4, s5, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: s_shl_v2i16:
@@ -59,26 +59,26 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 ;
 ; GFX10-LABEL: s_shl_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s3, s2
-; GFX10-NEXT:    s_mov_b32 s4, s0
-; GFX10-NEXT:    s_mov_b32 s5, s1
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s7, s6
+; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_shl_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_lshlrev_b16 v0, s3, s2
-; GFX11-NEXT:    s_mov_b32 s4, s0
-; GFX11-NEXT:    s_mov_b32 s5, s1
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT:    v_pk_lshlrev_b16 v0, s7, s6
+; GFX11-NEXT:    s_mov_b32 s0, s4
+; GFX11-NEXT:    s_mov_b32 s1, s5
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index 9a03d216c7a99..33249e4faeccd 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -19,16 +19,16 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %
 ;
 ; VI-LABEL: s_sext_i1_to_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_eq_u32 s2, s3
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_cmp_eq_u32 s6, s7
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i32
@@ -96,17 +96,17 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %
 ;
 ; VI-LABEL: s_sext_i1_to_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_eq_u32 s2, s3
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+; VI-NEXT:    s_cmp_eq_u32 s6, s7
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v1, v0
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i64
@@ -235,16 +235,16 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %
 ;
 ; VI-LABEL: s_sext_i1_to_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_eq_u32 s2, s3
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_cmp_eq_u32 s6, s7
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i16
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index f88aaf389ca9a..23222a45818c0 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -208,11 +208,11 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 {
 ; GFX11-LABEL: local_store_i48:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-NEXT:    ds_store_b16 v0, v1 offset:4
 ; GFX11-NEXT:    ds_store_b32 v0, v2
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index ded308ae4f230..45aa544bf80af 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -22,33 +22,33 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ;
 ; GFX8-LABEL: s_sub_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sub_i32 s2, s2, s3
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    s_sub_i32 s0, s6, s7
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_sub_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_sub_i32 s0, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_sub_i32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_sub_co_i32 s2, s2, s3
+; GFX12-NEXT:    s_sub_co_i32 s0, s6, s7
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -95,12 +95,12 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
 ;
 ; GFX12-LABEL: s_sub_imm_i32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_sub_co_i32 s2, 0x4d2, s2
+; GFX12-NEXT:    s_sub_co_i32 s0, 0x4d2, s6
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 6ec213a06999b..6c53afe840d18 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -214,58 +214,58 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr
 define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
 ; GFX9-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_pk_sub_i16 v0, s6, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_sub_v2i16_kernarg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshr_b32 s0, s2, 16
-; VI-NEXT:    s_lshr_b32 s1, s3, 16
-; VI-NEXT:    s_sub_i32 s0, s0, s1
-; VI-NEXT:    s_sub_i32 s1, s2, s3
-; VI-NEXT:    s_lshl_b32 s0, s0, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_or_b32 s0, s1, s0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_lshr_b32 s4, s6, 16
+; VI-NEXT:    s_lshr_b32 s5, s7, 16
+; VI-NEXT:    s_sub_i32 s4, s4, s5
+; VI-NEXT:    s_sub_i32 s5, s6, s7
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; VI-NEXT:    s_or_b32 s4, s5, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_sub_i16 v0, s2, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
-; GFX10-NEXT:    s_mov_b32 s5, s1
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    v_pk_sub_i16 v0, s6, s7
+; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_sub_i16 v0, s2, s3
-; GFX11-NEXT:    s_mov_b32 s4, s0
-; GFX11-NEXT:    s_mov_b32 s5, s1
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT:    v_pk_sub_i16 v0, s6, s7
+; GFX11-NEXT:    s_mov_b32 s0, s4
+; GFX11-NEXT:    s_mov_b32 s1, s5
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index f686aad0cefc2..c7952f561427c 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -218,35 +218,35 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ;
 ; VI-LABEL: s_udiv_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; VI-NEXT:    s_sub_i32 s4, 0, s3
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; VI-NEXT:    s_sub_i32 s0, 0, s7
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; VI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT:    v_mul_lo_u32 v1, s4, v0
-; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    v_mul_lo_u32 v1, s0, v0
+; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; VI-NEXT:    v_mul_hi_u32 v0, s2, v0
-; VI-NEXT:    v_readfirstlane_b32 s0, v0
-; VI-NEXT:    s_mul_i32 s0, s0, s3
-; VI-NEXT:    s_sub_i32 s0, s2, s0
-; VI-NEXT:    s_sub_i32 s1, s0, s3
+; VI-NEXT:    v_mul_hi_u32 v0, s6, v0
+; VI-NEXT:    v_readfirstlane_b32 s4, v0
+; VI-NEXT:    s_mul_i32 s4, s4, s7
+; VI-NEXT:    s_sub_i32 s4, s6, s4
+; VI-NEXT:    s_sub_i32 s5, s4, s7
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
-; VI-NEXT:    s_cmp_ge_u32 s0, s3
+; VI-NEXT:    s_cmp_ge_u32 s4, s7
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NEXT:    s_cselect_b32 s0, s1, s0
+; VI-NEXT:    s_cselect_b32 s4, s5, s4
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
-; VI-NEXT:    s_cmp_ge_u32 s0, s3
+; VI-NEXT:    s_cmp_ge_u32 s4, s7
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GCN-LABEL: s_udiv_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index fc6df735c05b0..e5de7d0a74642 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -122,38 +122,38 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
 ;
 ; VI-LABEL: v_cnd_nan:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_eq_u32 s2, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    s_cmp_eq_u32 s6, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s7
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_cnd_nan:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_eq_u32 s2, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1, s3, s[4:5]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1, s7, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_cnd_nan:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_eq_u32 s2, 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, -1, s3, s[4:5]
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -1, s7, s[0:1]
+; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 89fef7eead839..7f69c4733b38a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -89,110 +89,110 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) {
 define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg %src0ext, i32 inreg %src1ext) {
 ; SDAG-VI-LABEL: basic_smax_smin_sgpr:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, 0xff
 ; SDAG-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT:    v_max_i16_e64 v1, s2, 0
-; SDAG-VI-NEXT:    v_max_i16_e64 v2, s3, 0
+; SDAG-VI-NEXT:    v_max_i16_e64 v1, s6, 0
+; SDAG-VI-NEXT:    v_max_i16_e64 v2, s7, 0
 ; SDAG-VI-NEXT:    v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; SDAG-VI-NEXT:    v_min_i16_e32 v1, 0xff, v1
 ; SDAG-VI-NEXT:    v_or_b32_e32 v2, v1, v0
-; SDAG-VI-NEXT:    v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT:    v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT:    v_mov_b32_e32 v0, s4
+; SDAG-VI-NEXT:    v_mov_b32_e32 v1, s5
 ; SDAG-VI-NEXT:    flat_store_dword v[0:1], v2
 ; SDAG-VI-NEXT:    s_endpgm
 ;
 ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr:
 ; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_med3_i16 v2, s2, 0, v1
-; SDAG-GFX9-NEXT:    v_med3_i16 v1, s3, 0, v1
+; SDAG-GFX9-NEXT:    v_med3_i16 v2, s6, 0, v1
+; SDAG-GFX9-NEXT:    v_med3_i16 v1, s7, 0, v1
 ; SDAG-GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SDAG-GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
-; SDAG-GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX9-NEXT:    s_endpgm
 ;
 ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr:
 ; SDAG-GFX11:       ; %bb.0:
-; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; SDAG-GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_med3_i16 v0, s2, 0, 0xff
-; SDAG-GFX11-NEXT:    v_med3_i16 v1, s3, 0, 0xff
+; SDAG-GFX11-NEXT:    v_med3_i16 v0, s6, 0, 0xff
+; SDAG-GFX11-NEXT:    v_med3_i16 v1, s7, 0, 0xff
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SDAG-GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; SDAG-GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
+; SDAG-GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; SDAG-GFX11-NEXT:    s_nop 0
 ; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: basic_smax_smin_sgpr:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-VI-NEXT:    s_sext_i32_i16 s4, 0
-; GISEL-VI-NEXT:    s_sext_i32_i16 s5, 0xff
+; GISEL-VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-VI-NEXT:    s_sext_i32_i16 s0, 0
+; GISEL-VI-NEXT:    s_sext_i32_i16 s1, 0xff
 ; GISEL-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT:    s_sext_i32_i16 s3, s3
+; GISEL-VI-NEXT:    s_sext_i32_i16 s2, s6
+; GISEL-VI-NEXT:    s_sext_i32_i16 s3, s7
+; GISEL-VI-NEXT:    s_max_i32 s2, s2, s0
+; GISEL-VI-NEXT:    s_max_i32 s0, s3, s0
+; GISEL-VI-NEXT:    s_sext_i32_i16 s0, s0
 ; GISEL-VI-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT:    s_max_i32 s3, s3, s4
-; GISEL-VI-NEXT:    s_max_i32 s2, s2, s4
-; GISEL-VI-NEXT:    s_sext_i32_i16 s3, s3
-; GISEL-VI-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT:    s_min_i32 s3, s3, s5
-; GISEL-VI-NEXT:    s_min_i32 s2, s2, s5
-; GISEL-VI-NEXT:    s_and_b32 s3, 0xffff, s3
-; GISEL-VI-NEXT:    s_and_b32 s2, 0xffff, s2
-; GISEL-VI-NEXT:    s_lshl_b32 s3, s3, 16
-; GISEL-VI-NEXT:    s_or_b32 s2, s2, s3
-; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s2
-; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT:    s_min_i32 s0, s0, s1
+; GISEL-VI-NEXT:    s_min_i32 s2, s2, s1
+; GISEL-VI-NEXT:    s_and_b32 s0, 0xffff, s0
+; GISEL-VI-NEXT:    s_and_b32 s1, 0xffff, s2
+; GISEL-VI-NEXT:    s_lshl_b32 s0, s0, 16
+; GISEL-VI-NEXT:    s_or_b32 s0, s1, s0
+; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s5
 ; GISEL-VI-NEXT:    flat_store_dword v[0:1], v2
 ; GISEL-VI-NEXT:    s_endpgm
 ;
 ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr:
 ; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s4, 0
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s5, 0xff
+; GISEL-GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s0, 0
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s1, 0xff
 ; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s6
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s7
+; GISEL-GFX9-NEXT:    s_max_i32 s2, s2, s0
+; GISEL-GFX9-NEXT:    s_max_i32 s0, s3, s0
 ; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s3
-; GISEL-GFX9-NEXT:    s_max_i32 s2, s2, s4
-; GISEL-GFX9-NEXT:    s_max_i32 s3, s3, s4
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s3
-; GISEL-GFX9-NEXT:    s_min_i32 s2, s2, s5
-; GISEL-GFX9-NEXT:    s_min_i32 s3, s3, s5
-; GISEL-GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s0, s0
+; GISEL-GFX9-NEXT:    s_min_i32 s2, s2, s1
+; GISEL-GFX9-NEXT:    s_min_i32 s0, s0, s1
+; GISEL-GFX9-NEXT:    s_pack_ll_b32_b16 s0, s2, s0
+; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_sext_i32_i16 s4, 0
-; GISEL-GFX11-NEXT:    s_sext_i32_i16 s5, 0xff
+; GISEL-GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GISEL-GFX11-NEXT:    s_sext_i32_i16 s0, 0
+; GISEL-GFX11-NEXT:    s_sext_i32_i16 s1, 0xff
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    s_sext_i32_i16 s2, s6
+; GISEL-GFX11-NEXT:    s_sext_i32_i16 s3, s7
+; GISEL-GFX11-NEXT:    s_max_i32 s2, s2, s0
+; GISEL-GFX11-NEXT:    s_max_i32 s0, s3, s0
 ; GISEL-GFX11-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-GFX11-NEXT:    s_sext_i32_i16 s3, s3
-; GISEL-GFX11-NEXT:    s_max_i32 s2, s2, s4
-; GISEL-GFX11-NEXT:    s_max_i32 s3, s3, s4
-; GISEL-GFX11-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-GFX11-NEXT:    s_sext_i32_i16 s3, s3
-; GISEL-GFX11-NEXT:    s_min_i32 s2, s2, s5
-; GISEL-GFX11-NEXT:    s_min_i32 s3, s3, s5
+; GISEL-GFX11-NEXT:    s_sext_i32_i16 s0, s0
+; GISEL-GFX11-NEXT:    s_min_i32 s2, s2, s1
+; GISEL-GFX11-NEXT:    s_min_i32 s0, s0, s1
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
-; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT:    s_pack_ll_b32_b16 s0, s2, s0
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GISEL-GFX11-NEXT:    s_nop 0
 ; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 901e88a4c6aca..e12a4beb5dbe5 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1344,40 +1344,40 @@ exit:
 define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
 ; GFX1032-LABEL: fdiv_f32:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
+; GFX1032-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
 ; GFX1032-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX1032-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX1032-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX1032-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX1032-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX1032-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX1032-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX1032-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX1032-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX1032-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX1032-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX1032-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: fdiv_f32:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_div_scale_f32 v0, s[4:5], s3, s3, s2
+; GFX1064-NEXT:    v_div_scale_f32 v0, s[0:1], s7, s7, s6
 ; GFX1064-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX1064-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX1064-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX1064-NEXT:    v_div_scale_f32 v2, vcc, s2, s3, s2
+; GFX1064-NEXT:    v_div_scale_f32 v2, vcc, s6, s7, s6
 ; GFX1064-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX1064-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX1064-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX1064-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX1064-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX1064-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX1064-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX1064-NEXT:    s_endpgm
 entry:
   %fdiv = fdiv float %a, %b
@@ -2138,23 +2138,23 @@ main_body:
 define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX1032-LABEL: test_intr_fcmp_i64:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1032-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX1032-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_intr_fcmp_i64:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |s7|
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1064-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
@@ -2195,22 +2195,22 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) {
 define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX1032-LABEL: test_intr_fcmp_i32:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_intr_fcmp_i32:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |s7|
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1064-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index e15fd7f29671a..5422bfa5389c9 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -276,12 +276,12 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
 ;
 ; VI-LABEL: scalar_xor_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s2, s2, s3
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_xor_b32 s0, s6, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %result = xor i32 %a, %b

>From 2e1353753e7de18f8ca02d51db811b5f3c63d0d3 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Thu, 20 Jun 2024 15:02:07 +0000
Subject: [PATCH 2/6] Used hasXnackReplay call to check if xnack feature is
 enabled.

---
 .../Target/AMDGPU/SILoadStoreOptimizer.cpp    |  17 +-
 .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll        | 169 ++++----
 .../llvm.amdgcn.global.atomic.csub.ll         |  12 +-
 .../GlobalISel/llvm.amdgcn.intersect_ray.ll   |  78 ++--
 .../GlobalISel/llvm.amdgcn.set.inactive.ll    | 100 ++---
 .../GlobalISel/llvm.amdgcn.update.dpp.ll      |  56 +--
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll |  52 +--
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll |  46 +--
 llvm/test/CodeGen/AMDGPU/add.v2i16.ll         |  28 +-
 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll  |  48 +--
 llvm/test/CodeGen/AMDGPU/bfe-patterns.ll      |  40 +-
 llvm/test/CodeGen/AMDGPU/bfm.ll               |  10 +-
 llvm/test/CodeGen/AMDGPU/bitreverse.ll        | 314 +++++++--------
 llvm/test/CodeGen/AMDGPU/build_vector.ll      |  38 +-
 .../CodeGen/AMDGPU/calling-conventions.ll     |  50 +--
 llvm/test/CodeGen/AMDGPU/cluster_stores.ll    |  32 +-
 llvm/test/CodeGen/AMDGPU/ctlz.ll              | 274 ++++++-------
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   | 288 +++++++-------
 llvm/test/CodeGen/AMDGPU/ctpop16.ll           | 100 +++--
 llvm/test/CodeGen/AMDGPU/ctpop64.ll           |  48 +--
 llvm/test/CodeGen/AMDGPU/cttz.ll              | 186 ++++-----
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   | 298 +++++++-------
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll     | 371 +++++++++---------
 .../AMDGPU/divergence-driven-buildvector.ll   |  16 +-
 .../CodeGen/AMDGPU/extract_vector_elt-f16.ll  |  50 +--
 llvm/test/CodeGen/AMDGPU/fabs.f16.ll          |  12 +-
 llvm/test/CodeGen/AMDGPU/fabs.ll              |  34 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     |   6 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |  43 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll     |  66 ++--
 llvm/test/CodeGen/AMDGPU/fdiv.ll              | 200 +++++-----
 llvm/test/CodeGen/AMDGPU/flat_atomics.ll      |  20 +-
 .../CodeGen/AMDGPU/flat_atomics_i32_system.ll |  56 +--
 llvm/test/CodeGen/AMDGPU/fma-combine.ll       | 176 ++++-----
 .../AMDGPU/fmul-2-combine-multi-use.ll        |  18 +-
 llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll       | 300 +++++++-------
 llvm/test/CodeGen/AMDGPU/fnearbyint.ll        |  18 +-
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |  46 +--
 llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll     |  12 +-
 llvm/test/CodeGen/AMDGPU/fneg-fabs.ll         |  34 +-
 llvm/test/CodeGen/AMDGPU/fneg.ll              |  26 +-
 .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll |  52 +--
 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll |  32 +-
 llvm/test/CodeGen/AMDGPU/fp-classify.ll       |  26 +-
 .../AMDGPU/fp-min-max-buffer-atomics.ll       |  46 +--
 .../AMDGPU/fp-min-max-buffer-ptr-atomics.ll   |  72 ++--
 llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll      |  40 +-
 llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll      |  40 +-
 llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll      |  40 +-
 .../AMDGPU/fp64-min-max-buffer-atomics.ll     |  24 +-
 .../AMDGPU/fp64-min-max-buffer-ptr-atomics.ll |  52 +--
 llvm/test/CodeGen/AMDGPU/fp_to_sint.ll        |  38 +-
 llvm/test/CodeGen/AMDGPU/fp_to_uint.ll        |  38 +-
 llvm/test/CodeGen/AMDGPU/fshl.ll              |  46 +--
 llvm/test/CodeGen/AMDGPU/fshr.ll              |  26 +-
 llvm/test/CodeGen/AMDGPU/global_atomics.ll    |  32 +-
 .../AMDGPU/global_atomics_i32_system.ll       | 100 ++---
 llvm/test/CodeGen/AMDGPU/half.ll              |  82 ++--
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    | 176 ++++-----
 .../insert_waitcnt_for_precise_memory.ll      |  88 ++---
 llvm/test/CodeGen/AMDGPU/kernel-args.ll       | 106 ++---
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll   |  16 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll    |  32 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll    |  76 ++--
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll    |  16 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll    |  34 +-
 .../AMDGPU/llvm.amdgcn.intersect_ray.ll       |  48 +--
 .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll    | 280 ++++++-------
 .../AMDGPU/llvm.amdgcn.permlane16.var.ll      | 168 ++++----
 .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll   |  36 +-
 .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll      | 132 +++----
 .../llvm.amdgcn.sched.group.barrier.gfx11.ll  |  28 +-
 .../llvm.amdgcn.sched.group.barrier.gfx12.ll  |  28 +-
 .../AMDGPU/llvm.amdgcn.set.inactive.ll        |  80 ++--
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll  |  82 ++--
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          |  68 ++--
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        |  68 ++--
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |  14 +-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |  36 +-
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |  36 +-
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |  52 +--
 .../AMDGPU/llvm.r600.read.local.size.ll       |   8 +-
 llvm/test/CodeGen/AMDGPU/llvm.round.ll        | 104 +++--
 llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll        |  28 +-
 llvm/test/CodeGen/AMDGPU/madak.ll             |  80 ++--
 llvm/test/CodeGen/AMDGPU/memory_clause.ll     |  18 +-
 llvm/test/CodeGen/AMDGPU/merge-s-load.mir     | 180 ++-------
 llvm/test/CodeGen/AMDGPU/min.ll               |  54 +--
 llvm/test/CodeGen/AMDGPU/mul.ll               |  29 +-
 llvm/test/CodeGen/AMDGPU/mul_int24.ll         |  36 +-
 llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll |  58 +--
 llvm/test/CodeGen/AMDGPU/or.ll                |  16 +-
 llvm/test/CodeGen/AMDGPU/rotl.ll              |  18 +-
 llvm/test/CodeGen/AMDGPU/rotr.ll              |  16 +-
 llvm/test/CodeGen/AMDGPU/shl.v2i16.ll         |  42 +-
 llvm/test/CodeGen/AMDGPU/sign_extend.ll       |  54 +--
 llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll |   8 +-
 llvm/test/CodeGen/AMDGPU/sub.ll               |  26 +-
 llvm/test/CodeGen/AMDGPU/sub.v2i16.ll         |  42 +-
 llvm/test/CodeGen/AMDGPU/udiv.ll              |  34 +-
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll         |  20 +-
 llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll   |  78 ++--
 llvm/test/CodeGen/AMDGPU/xor.ll               |  10 +-
 103 files changed, 3573 insertions(+), 3660 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 0b285d52b539e..f1c9f19346c7d 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -216,8 +216,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
                                    CombineInfo &Paired, bool Modify = false);
   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                         const CombineInfo &Paired);
-  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired,
-                               const GCNSubtarget *STI = nullptr);
+  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                      const CombineInfo &Paired);
   const TargetRegisterClass *
@@ -344,7 +343,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORD_IMM:
-  case AMDGPU::S_LOAD_DWORD_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORD:
@@ -513,7 +511,6 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
-  case AMDGPU::S_LOAD_DWORD_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
@@ -602,7 +599,6 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
-  case AMDGPU::S_LOAD_DWORD_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
@@ -719,7 +715,6 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
-  case AMDGPU::S_LOAD_DWORD_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
@@ -1476,7 +1471,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  const unsigned Opcode = getNewOpcode(CI, Paired, STM);
+  const unsigned Opcode = getNewOpcode(CI, Paired);
 
   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
 
@@ -1688,8 +1683,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
 }
 
 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
-                                            const CombineInfo &Paired,
-                                            const GCNSubtarget *STI) {
+                                            const CombineInfo &Paired) {
   const unsigned Width = CI.Width + Paired.Width;
 
   switch (getCommonInstClass(CI, Paired)) {
@@ -1732,8 +1726,9 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
     }
   case S_LOAD_IMM:
-    // For targets that support XNACK replay, use the constrained load opcode.
-    if (STI && STI->hasXnackReplay()) {
+    // Use the constrained opcodes when the subtarget has the XNACK support
+    // enabled.
+    if (STM->isXNACKEnabled()) {
       switch (Width) {
       default:
         return 0;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 3f034eaca4997..eb20178f9f4d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -468,18 +468,18 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
 ;
 ; VI-LABEL: load_i8_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s6
-; VI-NEXT:    v_mov_b32_e32 v2, s7
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -513,16 +513,16 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v2i8_to_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v1, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -562,16 +562,16 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v3i8_to_v3f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s4
-; VI-NEXT:    v_mov_b32_e32 v4, s5
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -612,16 +612,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v4i8_to_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -679,11 +679,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ;
 ; VI-LABEL: load_v4i8_to_v4f32_unaligned:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
@@ -706,12 +706,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v3
 ; VI-NEXT:    v_or_b32_e32 v3, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
-; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -770,7 +770,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v6, 9
 ; VI-NEXT:    v_mov_b32_e32 v7, 8
@@ -780,9 +779,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v1, v[0:1]
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v2, 0xff
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
 ; VI-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -803,8 +804,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 24, v6
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_or_b32_e32 v2, v0, v2
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -857,11 +858,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v7i8_to_v7f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
@@ -883,10 +884,10 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; VI-NEXT:    flat_load_ubyte v4, v[8:9]
 ; VI-NEXT:    flat_load_ubyte v5, v[10:11]
 ; VI-NEXT:    flat_load_ubyte v6, v[12:13]
-; VI-NEXT:    s_add_u32 s0, s4, 16
-; VI-NEXT:    v_mov_b32_e32 v8, s5
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    v_mov_b32_e32 v7, s4
+; VI-NEXT:    v_mov_b32_e32 v8, s1
+; VI-NEXT:    v_mov_b32_e32 v7, s0
+; VI-NEXT:    s_add_u32 s0, s0, 16
+; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v10, s1
 ; VI-NEXT:    v_mov_b32_e32 v9, s0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
@@ -948,18 +949,18 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v8i8_to_v8f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
-; VI-NEXT:    s_add_u32 s0, s4, 16
-; VI-NEXT:    v_mov_b32_e32 v9, s5
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    v_mov_b32_e32 v8, s4
+; VI-NEXT:    v_mov_b32_e32 v9, s1
+; VI-NEXT:    v_mov_b32_e32 v8, s0
+; VI-NEXT:    s_add_u32 s0, s0, 16
+; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v11, s1
 ; VI-NEXT:    v_mov_b32_e32 v10, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1004,19 +1005,19 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: i8_zext_inreg_i32_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1050,18 +1051,18 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: i8_zext_inreg_hi1_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1095,18 +1096,18 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: i8_zext_i32_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s6
-; VI-NEXT:    v_mov_b32_e32 v2, s7
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1156,11 +1157,11 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
@@ -1183,12 +1184,12 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v3
 ; VI-NEXT:    v_or_b32_e32 v3, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
-; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1220,18 +1221,18 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte0_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1264,18 +1265,18 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte1_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1309,18 +1310,18 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte2_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1353,18 +1354,18 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte3_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index dbe7d473ea632..ade6e55b482bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -160,10 +160,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1)
 ;
 ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[4:5] offset:4096 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
@@ -199,10 +199,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa
 ;
 ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[4:5] offset:4096 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 87d0d712d5bde..ce402fb4e4abc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX1030-NEXT:    v_mov_b32_e32 v6, 4.0
@@ -637,10 +637,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40e00000
 ; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x41000000
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; GFX1030-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1030-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -651,7 +651,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
@@ -688,35 +688,35 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ;
 ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; GFX11-NEXT:    s_mov_b32 s2, 2.0
-; GFX11-NEXT:    s_mov_b32 s1, 1.0
-; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s8, 0x40400000
 ; GFX11-NEXT:    s_mov_b32 s12, 0x40c00000
+; GFX11-NEXT:    s_mov_b32 s10, 0x40a00000
+; GFX11-NEXT:    s_mov_b32 s9, 4.0
 ; GFX11-NEXT:    s_mov_b32 s14, 0x41000000
 ; GFX11-NEXT:    s_mov_b32 s13, 0x40e00000
 ; GFX11-NEXT:    v_mov_b32_e32 v6, s12
 ; GFX11-NEXT:    v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
-; GFX11-NEXT:    s_mov_b32 s4, 0x40400000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_mov_b32 s1, 1.0
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX11-NEXT:    flat_load_b32 v9, v[0:1]
 ; GFX11-NEXT:    flat_load_b32 v10, v[2:3]
-; GFX11-NEXT:    s_mov_b32 s6, 0x40a00000
-; GFX11-NEXT:    s_mov_b32 s5, 4.0
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT:    s_mov_b32 s2, 2.0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v4, s5
+; GFX11-NEXT:    v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[8:11]
+; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
 ; GFX11-NEXT:    s_endpgm
@@ -742,16 +742,16 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x44004200
 ; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x46004500
 ; GFX1030-NEXT:    v_mov_b32_e32 v7, 0x48004700
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; GFX1030-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1030-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -762,7 +762,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
@@ -796,30 +796,30 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
 ;
 ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; GFX11-NEXT:    s_mov_b32 s2, 2.0
+; GFX11-NEXT:    s_mov_b32 s8, 0x42004600
+; GFX11-NEXT:    s_mov_b32 s9, 0x44004700
+; GFX11-NEXT:    s_mov_b32 s10, 0x45004800
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    s_mov_b32 s1, 1.0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
-; GFX11-NEXT:    s_mov_b32 s4, 0x42004600
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX11-NEXT:    s_mov_b32 s5, 0x44004700
 ; GFX11-NEXT:    flat_load_b32 v6, v[0:1]
 ; GFX11-NEXT:    flat_load_b32 v7, v[2:3]
-; GFX11-NEXT:    s_mov_b32 s6, 0x45004800
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT:    s_mov_b32 s2, 2.0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v4, s5
+; GFX11-NEXT:    v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[8:11] a16
+; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index fa24489df52dc..0c60be9d94591 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -180,21 +180,21 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half>
 define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
 ; GCN-LABEL: set_inactive_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s0, 1
-; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 1
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
   store <2 x i32> %tmp, ptr addrspace(1) %out
@@ -204,21 +204,21 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
 define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GCN-LABEL: set_inactive_v2f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s0, 1.0
-; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 1.0
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
   store <2 x float> %tmp, ptr addrspace(1) %out
@@ -248,21 +248,21 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa
 define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
 ; GCN-LABEL: set_inactive_v4i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s0, 0x10001
-; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 0x10001
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
   store <4 x i16> %tmp, ptr addrspace(1) %out
@@ -272,21 +272,21 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
 define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; GCN-LABEL: set_inactive_v4f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s0, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
   store <4 x half> %tmp, ptr addrspace(1) %out
@@ -296,21 +296,21 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
 define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
 ; GCN-LABEL: set_inactive_v4bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s0, 0x3f803f80
-; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 0x3f803f80
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
   store <4 x bfloat> %tmp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index d6282708ece35..1092bb4dc834a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -6,14 +6,14 @@
 define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
 ; GFX8-LABEL: dpp_test:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX8-NEXT:    s_nop 1
 ; GFX8-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -30,12 +30,12 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
 ;
 ; GFX11-LABEL: dpp_test:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -156,19 +156,19 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
 define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
 ; GFX8-LABEL: update_dppv2i32_test:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -188,15 +188,15 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
 ;
 ; GFX11-LABEL: update_dppv2i32_test:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
-; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[4:5]
+; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -211,19 +211,19 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
 define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
 ; GFX8-LABEL: update_dppv2f32_test:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -243,15 +243,15 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
 ;
 ; GFX11-LABEL: update_dppv2f32_test:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
-; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[4:5]
+; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index c3bd56610d102..8a2274cbfbf62 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -616,26 +616,26 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: sdivrem_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_ashr_i32 s2, s14, 31
-; GFX8-NEXT:    s_add_i32 s0, s14, s2
+; GFX8-NEXT:    s_ashr_i32 s2, s10, 31
+; GFX8-NEXT:    s_add_i32 s0, s10, s2
 ; GFX8-NEXT:    s_xor_b32 s3, s0, s2
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX8-NEXT:    s_ashr_i32 s4, s15, 31
-; GFX8-NEXT:    s_add_i32 s0, s15, s4
-; GFX8-NEXT:    s_xor_b32 s5, s0, s4
+; GFX8-NEXT:    s_ashr_i32 s10, s11, 31
+; GFX8-NEXT:    s_add_i32 s0, s11, s10
+; GFX8-NEXT:    s_xor_b32 s11, s0, s10
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_sub_i32 s0, 0, s3
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s5
-; GFX8-NEXT:    s_ashr_i32 s6, s12, 31
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX8-NEXT:    s_ashr_i32 s12, s8, 31
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX8-NEXT:    s_sub_i32 s1, 0, s5
+; GFX8-NEXT:    s_sub_i32 s1, 0, s11
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX8-NEXT:    s_add_i32 s0, s12, s6
-; GFX8-NEXT:    s_xor_b32 s0, s0, s6
+; GFX8-NEXT:    s_add_i32 s0, s8, s12
+; GFX8-NEXT:    s_xor_b32 s0, s0, s12
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
@@ -654,39 +654,39 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
-; GFX8-NEXT:    s_xor_b32 s0, s6, s2
-; GFX8-NEXT:    s_ashr_i32 s2, s13, 31
-; GFX8-NEXT:    s_add_i32 s1, s13, s2
+; GFX8-NEXT:    s_xor_b32 s0, s12, s2
+; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
+; GFX8-NEXT:    s_add_i32 s1, s9, s2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v3
+; GFX8-NEXT:    v_xor_b32_e32 v2, s12, v3
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s5
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v2
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s11
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s12, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s5, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s5, v3
-; GFX8-NEXT:    s_xor_b32 s0, s2, s4
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
+; GFX8-NEXT:    s_xor_b32 s0, s2, s10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s8
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v5, s9
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 63a0d8afd4de0..62d8b7d6f045c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -522,11 +522,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: udivrem_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s14
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GFX8-NEXT:    s_sub_i32 s0, 0, s14
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX8-NEXT:    s_sub_i32 s0, 0, s10
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -534,43 +534,43 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX8-NEXT:    s_sub_i32 s0, 0, s15
+; GFX8-NEXT:    s_sub_i32 s0, 0, s11
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT:    v_mul_hi_u32 v1, s13, v1
-; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s14
+; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s10
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
-; GFX8-NEXT:    v_mul_lo_u32 v4, v1, s15
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s12, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
+; GFX8-NEXT:    v_mul_lo_u32 v4, v1, s11
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s8, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s14, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s10, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s14, v2
+; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s10, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s13, v4
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s9, v4
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s15, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s15, v3
+; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, s8
-; GFX8-NEXT:    v_mov_b32_e32 v5, s9
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v0, s10
-; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index be9b5b00c39d7..611a7b566070c 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -226,18 +226,18 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr
 define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
 ; VI-LABEL: s_test_add_v2i16_kernarg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s0, s6, 16
-; VI-NEXT:    s_lshr_b32 s1, s7, 16
-; VI-NEXT:    s_add_i32 s2, s6, s7
-; VI-NEXT:    s_add_i32 s0, s0, s1
-; VI-NEXT:    s_and_b32 s1, s2, 0xffff
-; VI-NEXT:    s_lshl_b32 s0, s0, 16
-; VI-NEXT:    s_or_b32 s0, s1, s0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_lshr_b32 s4, s2, 16
+; VI-NEXT:    s_lshr_b32 s5, s3, 16
+; VI-NEXT:    s_add_i32 s2, s2, s3
+; VI-NEXT:    s_add_i32 s4, s4, s5
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_lshl_b32 s3, s4, 16
+; VI-NEXT:    s_or_b32 s2, s2, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -262,11 +262,11 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ;
 ; GFX11-LABEL: s_test_add_v2i16_kernarg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_add_u16 v1, s6, s7
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_pk_add_u16 v1, s2, s3
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index ad6009e378178..2c69ae58f0e61 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -9,19 +9,19 @@ declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32)
 define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) {
 ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
 ; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
@@ -33,19 +33,19 @@ entry:
 define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
 ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
 ; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
@@ -92,18 +92,18 @@ entry:
 define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) {
 ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v0, v0, v1, s[4:5] offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
@@ -114,20 +114,20 @@ entry:
 define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
 ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v0, v1, s[4:5] offset:-16
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v1, v0, s[4:5] offset:-16
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index 1639ec6512208..af4116bd6aae5 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -131,13 +131,13 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %
 ;
 ; VI-LABEL: s_ubfe_sub_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_sub_i32 s0, 32, s7
-; VI-NEXT:    s_lshl_b32 s1, s6, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    s_sub_i32 s0, 32, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_lshl_b32 s1, s2, s0
 ; VI-NEXT:    s_lshr_b32 s0, s1, s0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -175,13 +175,13 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: s_ubfe_sub_multi_use_shl_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_sub_i32 s0, 32, s7
-; VI-NEXT:    s_lshl_b32 s1, s6, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    s_sub_i32 s0, 32, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_lshl_b32 s1, s2, s0
 ; VI-NEXT:    s_lshr_b32 s0, s1, s0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -329,13 +329,13 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %
 ;
 ; VI-LABEL: s_sbfe_sub_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_sub_i32 s0, 32, s7
-; VI-NEXT:    s_lshl_b32 s1, s6, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    s_sub_i32 s0, 32, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_lshl_b32 s1, s2, s0
 ; VI-NEXT:    s_ashr_i32 s0, s1, s0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -373,13 +373,13 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: s_sbfe_sub_multi_use_shl_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_sub_i32 s0, 32, s7
-; VI-NEXT:    s_lshl_b32 s1, s6, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    s_sub_i32 s0, 32, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_lshl_b32 s1, s2, s0
 ; VI-NEXT:    s_ashr_i32 s0, s1, s0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll
index 8b2f66b438f14..f8bd44b7c98f5 100644
--- a/llvm/test/CodeGen/AMDGPU/bfm.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfm.ll
@@ -18,12 +18,12 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y)
 ;
 ; VI-LABEL: s_bfm_pattern:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bfm_b32 s0, s6, s7
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_bfm_b32 s2, s2, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %a = shl i32 1, %x
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index 49ec09ddb7770..64555f14a55cc 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -117,64 +117,64 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; FLAT-LABEL: v_brev_i16:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; FLAT-NEXT:    s_mov_b32 s3, 0xf000
-; FLAT-NEXT:    s_mov_b32 s2, -1
-; FLAT-NEXT:    s_mov_b32 s10, s2
-; FLAT-NEXT:    s_mov_b32 s11, s3
+; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; FLAT-NEXT:    s_mov_b32 s7, 0xf000
+; FLAT-NEXT:    s_mov_b32 s6, -1
+; FLAT-NEXT:    s_mov_b32 s10, s6
+; FLAT-NEXT:    s_mov_b32 s11, s7
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    s_mov_b32 s8, s6
-; FLAT-NEXT:    s_mov_b32 s9, s7
+; FLAT-NEXT:    s_mov_b32 s8, s2
+; FLAT-NEXT:    s_mov_b32 s9, s3
 ; FLAT-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; FLAT-NEXT:    s_mov_b32 s0, s4
-; FLAT-NEXT:    s_mov_b32 s1, s5
+; FLAT-NEXT:    s_mov_b32 s4, s0
+; FLAT-NEXT:    s_mov_b32 s5, s1
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
 ; FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; FLAT-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; FLAT-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: v_brev_i16:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; GISEL-NEXT:    flat_load_ushort v0, v[0:1]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GISEL-NEXT:    flat_store_short v[0:1], v2
 ; GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FLAT-LABEL: v_brev_i16:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
+; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
 ; GFX11-FLAT-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    s_mov_b32 s0, s6
-; GFX11-FLAT-NEXT:    s_mov_b32 s1, s7
-; GFX11-FLAT-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-FLAT-NEXT:    s_mov_b32 s4, s2
+; GFX11-FLAT-NEXT:    s_mov_b32 s5, s3
+; GFX11-FLAT-NEXT:    buffer_load_u16 v0, off, s[4:7], 0
 ; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
-; GFX11-FLAT-NEXT:    global_store_d16_hi_b16 v1, v0, s[4:5]
+; GFX11-FLAT-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_i16:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GFX11-GISEL-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
-; GFX11-GISEL-NEXT:    global_store_d16_hi_b16 v0, v1, s[4:5]
+; GFX11-GISEL-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -278,61 +278,61 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; FLAT-LABEL: v_brev_i32:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_mov_b32_e32 v1, s7
-; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; FLAT-NEXT:    v_mov_b32_e32 v1, s3
+; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dword v0, v[0:1]
-; FLAT-NEXT:    s_mov_b32 s7, 0xf000
-; FLAT-NEXT:    s_mov_b32 s6, -1
+; FLAT-NEXT:    s_mov_b32 s3, 0xf000
+; FLAT-NEXT:    s_mov_b32 s2, -1
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
-; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; FLAT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: v_brev_i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    flat_load_dword v0, v[0:1]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FLAT-LABEL: v_brev_i32:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    global_load_b32 v0, v0, s[6:7]
-; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
+; GFX11-FLAT-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
-; GFX11-FLAT-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FLAT-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_i32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-GISEL-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -362,59 +362,59 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32>
 ;
 ; FLAT-LABEL: s_brev_v2i32:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; FLAT-NEXT:    s_mov_b32 s3, 0xf000
-; FLAT-NEXT:    s_mov_b32 s2, -1
+; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; FLAT-NEXT:    s_mov_b32 s7, 0xf000
+; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    s_mov_b32 s0, s4
-; FLAT-NEXT:    s_mov_b32 s1, s5
-; FLAT-NEXT:    s_brev_b32 s4, s7
-; FLAT-NEXT:    s_brev_b32 s5, s6
-; FLAT-NEXT:    v_mov_b32_e32 v0, s5
-; FLAT-NEXT:    v_mov_b32_e32 v1, s4
-; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; FLAT-NEXT:    s_mov_b32 s4, s0
+; FLAT-NEXT:    s_mov_b32 s5, s1
+; FLAT-NEXT:    s_brev_b32 s0, s3
+; FLAT-NEXT:    s_brev_b32 s1, s2
+; FLAT-NEXT:    v_mov_b32_e32 v0, s1
+; FLAT-NEXT:    v_mov_b32_e32 v1, s0
+; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_brev_v2i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_brev_b32 s0, s6
-; GISEL-NEXT:    s_brev_b32 s1, s7
-; GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GISEL-NEXT:    s_brev_b32 s2, s2
+; GISEL-NEXT:    s_brev_b32 s3, s3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FLAT-LABEL: s_brev_v2i32:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
+; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    s_brev_b32 s0, s6
-; GFX11-FLAT-NEXT:    s_brev_b32 s1, s7
+; GFX11-FLAT-NEXT:    s_brev_b32 s2, s2
+; GFX11-FLAT-NEXT:    s_brev_b32 s3, s3
 ; GFX11-FLAT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FLAT-NEXT:    s_mov_b32 s0, s4
-; GFX11-FLAT-NEXT:    s_mov_b32 s1, s5
-; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-FLAT-NEXT:    s_mov_b32 s4, s0
+; GFX11-FLAT-NEXT:    s_mov_b32 s5, s1
+; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: s_brev_v2i32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_brev_b32 s0, s6
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, s7
+; GFX11-GISEL-NEXT:    s_brev_b32 s2, s2
+; GFX11-GISEL-NEXT:    s_brev_b32 s3, s3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -446,33 +446,33 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; FLAT-LABEL: v_brev_v2i32:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_mov_b32_e32 v1, s7
-; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; FLAT-NEXT:    v_mov_b32_e32 v1, s3
+; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; FLAT-NEXT:    s_mov_b32 s7, 0xf000
-; FLAT-NEXT:    s_mov_b32 s6, -1
+; FLAT-NEXT:    s_mov_b32 s3, 0xf000
+; FLAT-NEXT:    s_mov_b32 s2, -1
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
 ; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
-; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: v_brev_v2i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
@@ -481,31 +481,31 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX11-FLAT-LABEL: v_brev_v2i32:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    global_load_b64 v[0:1], v0, s[6:7]
-; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
+; GFX11-FLAT-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
-; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_v2i32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[6:7]
+; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
-; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -534,52 +534,52 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #
 ;
 ; FLAT-LABEL: s_brev_i64:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; FLAT-NEXT:    s_mov_b32 s3, 0xf000
-; FLAT-NEXT:    s_mov_b32 s2, -1
+; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; FLAT-NEXT:    s_mov_b32 s7, 0xf000
+; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    s_mov_b32 s0, s4
-; FLAT-NEXT:    s_mov_b32 s1, s5
-; FLAT-NEXT:    s_brev_b64 s[4:5], s[6:7]
-; FLAT-NEXT:    v_mov_b32_e32 v0, s4
-; FLAT-NEXT:    v_mov_b32_e32 v1, s5
-; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; FLAT-NEXT:    s_mov_b32 s4, s0
+; FLAT-NEXT:    s_mov_b32 s5, s1
+; FLAT-NEXT:    s_brev_b64 s[0:1], s[2:3]
+; FLAT-NEXT:    v_mov_b32_e32 v0, s0
+; FLAT-NEXT:    v_mov_b32_e32 v1, s1
+; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_brev_i64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_brev_b64 s[0:1], s[6:7]
-; GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FLAT-LABEL: s_brev_i64:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    s_brev_b64 s[0:1], s[6:7]
-; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
-; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-FLAT-NEXT:    s_brev_b64 s[4:5], s[2:3]
+; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
+; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: s_brev_i64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_brev_b64 s[0:1], s[6:7]
+; GFX11-GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -611,33 +611,33 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; FLAT-LABEL: v_brev_i64:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_mov_b32_e32 v1, s7
-; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; FLAT-NEXT:    v_mov_b32_e32 v1, s3
+; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; FLAT-NEXT:    s_mov_b32 s7, 0xf000
-; FLAT-NEXT:    s_mov_b32 s6, -1
+; FLAT-NEXT:    s_mov_b32 s3, 0xf000
+; FLAT-NEXT:    s_mov_b32 s2, -1
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
 ; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
-; FLAT-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
+; FLAT-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: v_brev_i64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v3, s4
-; GISEL-NEXT:    v_mov_b32_e32 v4, s5
+; GISEL-NEXT:    v_mov_b32_e32 v4, s1
+; GISEL-NEXT:    v_mov_b32_e32 v3, s0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
 ; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
@@ -646,31 +646,31 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX11-FLAT-LABEL: v_brev_i64:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    global_load_b64 v[0:1], v0, s[6:7]
-; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
+; GFX11-FLAT-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
-; GFX11-FLAT-NEXT:    buffer_store_b64 v[1:2], off, s[4:7], 0
+; GFX11-FLAT-NEXT:    buffer_store_b64 v[1:2], off, s[0:3], 0
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_i64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[6:7]
+; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    global_store_b64 v0, v[1:2], s[4:5]
+; GFX11-GISEL-NEXT:    global_store_b64 v0, v[1:2], s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -793,74 +793,74 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; FLAT-LABEL: v_brev_v2i64:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_mov_b32_e32 v1, s7
-; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; FLAT-NEXT:    v_mov_b32_e32 v1, s3
+; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; FLAT-NEXT:    s_mov_b32 s7, 0xf000
-; FLAT-NEXT:    s_mov_b32 s6, -1
+; FLAT-NEXT:    s_mov_b32 s3, 0xf000
+; FLAT-NEXT:    s_mov_b32 s2, -1
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT-NEXT:    v_bfrev_b32_e32 v4, v2
 ; FLAT-NEXT:    v_bfrev_b32_e32 v3, v3
 ; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
 ; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
-; FLAT-NEXT:    buffer_store_dwordx4 v[1:4], off, s[4:7], 0
+; FLAT-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0
 ; FLAT-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: v_brev_v2i64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_bfrev_b32_e32 v4, v1
 ; GISEL-NEXT:    v_bfrev_b32_e32 v5, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v6, v3
 ; GISEL-NEXT:    v_bfrev_b32_e32 v7, v2
-; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FLAT-LABEL: v_brev_v2i64:
 ; GFX11-FLAT:       ; %bb.0:
-; GFX11-FLAT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT:    global_load_b128 v[0:3], v0, s[6:7]
-; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
+; GFX11-FLAT-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
+; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v4, v2
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v3, v3
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
-; GFX11-FLAT-NEXT:    buffer_store_b128 v[1:4], off, s[4:7], 0
+; GFX11-FLAT-NEXT:    buffer_store_b128 v[1:4], off, s[0:3], 0
 ; GFX11-FLAT-NEXT:    s_nop 0
 ; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_v2i64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_b128 v[0:3], v0, s[6:7]
+; GFX11-GISEL-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v4, v1
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v5, v0
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v6, v3
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v7, v2
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    global_store_b128 v0, v[4:7], s[4:5]
+; GFX11-GISEL-NEXT:    global_store_b128 v0, v[4:7], s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 8d347ae421437..e914635d6c26f 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -277,39 +277,39 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
 ;
 ; GFX8-LABEL: build_v2i32_from_v4i16_shuffle:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s0, s7, 16
-; GFX8-NEXT:    s_lshl_b32 s1, s6, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s0, s6, 16
-; GFX10-NEXT:    s_lshl_b32 s1, s7, 16
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: build_v2i32_from_v4i16_shuffle:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s0, s6, 16
-; GFX11-NEXT:    s_lshl_b32 s1, s7, 16
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 00af922b05f93..15ebdd70ae881 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -18,20 +18,20 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) {
 ;
 ; VI-LABEL: kernel:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: kernel:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1245,26 +1245,26 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ;
 ; VI-LABEL: amd_kernel_v5i8:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s0, s2, 24
-; VI-NEXT:    s_lshr_b32 s1, s2, 16
-; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_lshr_b32 s2, s0, 24
+; VI-NEXT:    s_lshr_b32 s3, s0, 16
+; VI-NEXT:    s_add_i32 s3, s3, s3
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_add_i32 s0, s0, s0
+; VI-NEXT:    v_lshlrev_b16_e64 v1, 8, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    s_and_b32 s3, s3, 0xff
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
 ; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_add_i32 s3, s3, s3
+; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_mov_b32_e32 v0, 4
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s3
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    flat_store_byte v[0:1], v5
@@ -1273,16 +1273,16 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ;
 ; GFX11-LABEL: amd_kernel_v5i8:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b16 v0, 8, s2
-; GFX11-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX11-NEXT:    s_lshr_b32 s1, s2, 24
-; GFX11-NEXT:    v_add_nc_u16 v1, s2, s2
-; GFX11-NEXT:    v_add_nc_u16 v2, s1, s1
+; GFX11-NEXT:    v_lshrrev_b16 v0, 8, s0
+; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX11-NEXT:    v_add_nc_u16 v1, s0, s0
+; GFX11-NEXT:    v_add_nc_u16 v2, s3, s3
 ; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, s0, s0
-; GFX11-NEXT:    v_add_nc_u16 v6, s3, s3
+; GFX11-NEXT:    v_add_nc_u16 v3, s2, s2
+; GFX11-NEXT:    v_add_nc_u16 v6, s1, s1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v2, 8, v2
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 8, v0
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index 9336816866614..d511bb1f4a257 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -30,16 +30,16 @@
 define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noalias %sb) {
 ; GFX9-LABEL: cluster_load_cluster_store:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX9-NEXT:    flat_load_dword v3, v[0:1] offset:8
 ; GFX9-NEXT:    flat_load_dword v4, v[0:1] offset:16
 ; GFX9-NEXT:    flat_load_dword v5, v[0:1] offset:24
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    flat_store_dword v[0:1], v3 offset:8
@@ -96,15 +96,15 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
 ;
 ; GFX11-LABEL: cluster_load_cluster_store:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    flat_load_b32 v2, v[0:1]
 ; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:8
 ; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:16
 ; GFX11-NEXT:    flat_load_b32 v5, v[0:1] offset:24
-; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
@@ -155,16 +155,16 @@ bb:
 define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr noalias %sb) {
 ; GFX9-LABEL: cluster_load_valu_cluster_store:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX9-NEXT:    flat_load_dword v3, v[0:1] offset:8
 ; GFX9-NEXT:    flat_load_dword v4, v[0:1] offset:16
 ; GFX9-NEXT:    flat_load_dword v5, v[0:1] offset:24
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, 1, v3
@@ -223,15 +223,15 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
 ;
 ; GFX11-LABEL: cluster_load_valu_cluster_store:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    flat_load_b32 v2, v[0:1] offset:8
 ; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
 ; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:16
 ; GFX11-NEXT:    flat_load_b32 v5, v[0:1] offset:24
-; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 1, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 332b6013803cf..3145ee1f6141e 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -127,19 +127,19 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; VI-LABEL: v_ctlz_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i32:
@@ -190,15 +190,15 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX11-LABEL: v_ctlz_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -235,21 +235,21 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; VI-LABEL: v_ctlz_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v1, 32, v1
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_v2i32:
@@ -307,18 +307,18 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX11-LABEL: v_ctlz_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[6:7]
+; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -359,15 +359,15 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; VI-LABEL: v_ctlz_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v3, v3
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v2
@@ -377,7 +377,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    v_min_u32_e32 v1, 32, v1
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_v4i32:
@@ -449,11 +449,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX11-LABEL: v_ctlz_v4i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[6:7]
+; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v3, v3
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v2
@@ -465,7 +465,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -500,22 +500,22 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; VI-LABEL: v_ctlz_i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_mov_b32 s10, s2
-; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s6
-; VI-NEXT:    s_mov_b32 s9, s7
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_subrev_u32_e32 v0, vcc, 24, v0
-; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i8:
@@ -576,16 +576,16 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; GFX11-LABEL: v_ctlz_i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v1, v0, s[6:7]
+; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX11-NEXT:    global_store_b8 v0, v1, s[4:5]
+; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -706,16 +706,16 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; VI-LABEL: s_ctlz_i64_trunc:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_flbit_i32_b64 s4, s[6:7]
-; VI-NEXT:    s_min_u32 s4, s4, 64
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_flbit_i32_b64 s0, s[2:3]
+; VI-NEXT:    s_min_u32 s0, s0, 64
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_ctlz_i64_trunc:
@@ -759,13 +759,13 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; GFX11-LABEL: s_ctlz_i64_trunc:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clz_i32_u64 s0, s[6:7]
+; GFX11-NEXT:    s_clz_i32_u64 s2, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_u32 s0, s0, 64
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    s_min_u32 s2, s2, 64
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -799,16 +799,16 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; VI-LABEL: v_ctlz_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v3
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, s5
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
@@ -878,10 +878,10 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX11-LABEL: v_ctlz_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[6:7]
+; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
@@ -889,7 +889,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX11-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -926,16 +926,16 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; VI-LABEL: v_ctlz_i64_trunc:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s7
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
 ; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v0
-; VI-NEXT:    v_mov_b32_e32 v4, s5
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
+; VI-NEXT:    v_mov_b32_e32 v4, s1
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v1
@@ -1005,18 +1005,18 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; GFX11-LABEL: v_ctlz_i64_trunc:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[1:2], v1, s[6:7]
+; GFX11-NEXT:    global_load_b64 v[1:2], v1, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX11-NEXT:    v_min3_u32 v1, v1, v2, 64
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1052,18 +1052,18 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
@@ -1117,13 +1117,13 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1159,18 +1159,18 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
@@ -1224,13 +1224,13 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1270,21 +1270,21 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
@@ -1343,17 +1343,17 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1392,21 +1392,21 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth:
@@ -1465,17 +1465,17 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1510,17 +1510,17 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
-; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i8_sel_eq_neg1:
@@ -1583,13 +1583,13 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
-; GFX11-NEXT:    global_store_b8 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b8 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1624,25 +1624,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_mov_b32 s10, s2
-; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s6
-; VI-NEXT:    s_mov_b32 s9, s7
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v0
 ; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, -16, v2
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i16_sel_eq_neg1:
@@ -1705,10 +1705,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
@@ -1717,7 +1717,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, -16, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
-; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1752,18 +1752,18 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
@@ -1829,14 +1829,14 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7f, v0
-; GFX11-NEXT:    global_store_b8 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b8 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 97529b5687a71..a377714ebf737 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -99,17 +99,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -174,15 +174,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; VI-LABEL: v_ctlz_zero_undef_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
@@ -254,15 +254,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; VI-LABEL: v_ctlz_zero_undef_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v3, v3
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v2
@@ -534,13 +534,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: s_ctlz_zero_undef_i64_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_flbit_i32_b64 s0, s[6:7]
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -602,18 +602,18 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i8_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 24, v0
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -698,14 +698,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i16_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s6, 1
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    s_add_u32 s4, s2, 1
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -716,8 +716,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, -16, v1
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -810,22 +810,22 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s6, 3
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_add_u32 s0, s6, 2
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    s_add_u32 s0, s6, 1
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v7, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s2, s2, 1
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v7, s3
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v6, s2
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[6:7]
@@ -841,8 +841,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v2, 32, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -947,43 +947,43 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i64_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s6, 5
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_add_u32 s0, s6, 4
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_add_u32 s0, s6, 7
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    s_add_u32 s0, s6, 6
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v7, s1
-; VI-NEXT:    v_mov_b32_e32 v6, s0
-; VI-NEXT:    s_add_u32 s0, s6, 3
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v9, s1
-; VI-NEXT:    v_mov_b32_e32 v8, s0
-; VI-NEXT:    s_add_u32 s0, s6, 2
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v11, s1
-; VI-NEXT:    v_mov_b32_e32 v10, s0
-; VI-NEXT:    s_add_u32 s0, s6, 1
+; VI-NEXT:    s_add_u32 s4, s2, 5
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_add_u32 s4, s2, 4
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    s_add_u32 s4, s2, 7
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    s_add_u32 s4, s2, 6
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v7, s5
+; VI-NEXT:    v_mov_b32_e32 v6, s4
+; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v9, s5
+; VI-NEXT:    v_mov_b32_e32 v8, s4
+; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v11, s5
+; VI-NEXT:    v_mov_b32_e32 v10, s4
+; VI-NEXT:    s_add_u32 s4, s2, 1
 ; VI-NEXT:    flat_load_ubyte v12, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v13, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v4, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v5, v[6:7]
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    flat_load_ubyte v6, v[8:9]
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_load_ubyte v7, v[10:11]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
@@ -1010,9 +1010,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_min_u32_e32 v0, v0, v3
-; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_min_u32_e32 v0, 64, v0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1114,17 +1114,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1257,12 +1257,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; VI-LABEL: s_ctlz_zero_undef_i64_trunc:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_flbit_i32_b64 s0, s[6:7]
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1319,16 +1319,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v3
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, s5
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
@@ -1409,16 +1409,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i64_trunc:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s7
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
 ; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
-; VI-NEXT:    v_mov_b32_e32 v4, s5
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v0
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v1
@@ -1499,17 +1499,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1578,17 +1578,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1656,16 +1656,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1751,15 +1751,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v3, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
@@ -1851,19 +1851,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1935,19 +1935,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -2019,19 +2019,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -2104,19 +2104,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
 ;
 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 4f2bde8d0842b..b6359f1816979 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -91,18 +91,18 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp
 ;
 ; VI-LABEL: v_ctpop_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctpop_i16:
@@ -344,15 +344,15 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr
 ;
 ; VI-LABEL: v_ctpop_v2i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -360,7 +360,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctpop_v2i16:
@@ -430,15 +430,15 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr
 ;
 ; VI-LABEL: v_ctpop_v4i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
@@ -452,7 +452,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v3
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctpop_v4i16:
@@ -562,17 +562,15 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr
 ;
 ; VI-LABEL: v_ctpop_v8i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
@@ -771,20 +769,18 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add
 ;
 ; VI-LABEL: v_ctpop_v16i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v4, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
 ; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
@@ -1039,18 +1035,18 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal
 ;
 ; VI-LABEL: v_ctpop_i16_add_inline_constant:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 4
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctpop_i16_add_inline_constant:
@@ -1111,18 +1107,18 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1)
 ;
 ; VI-LABEL: v_ctpop_i16_add_inline_constant_inv:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 4
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctpop_i16_add_inline_constant_inv:
@@ -1184,19 +1180,19 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out
 ;
 ; VI-LABEL: v_ctpop_i16_add_literal:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; VI-NEXT:    s_movk_i32 s0, 0x3e7
+; VI-NEXT:    s_movk_i32 s4, 0x3e7
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s0
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctpop_i16_add_literal:
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 633f12047e5b1..131ce14a7847c 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -66,19 +66,19 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp
 ;
 ; VI-LABEL: v_ctpop_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -242,21 +242,21 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr
 ;
 ; VI-LABEL: v_ctpop_v2i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
 ; VI-NEXT:    v_bcnt_u32_b32 v1, v3, v2
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
@@ -298,18 +298,18 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr
 ;
 ; VI-LABEL: v_ctpop_v4i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v4, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v8, v2, 0
@@ -320,7 +320,7 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr
 ; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v4, v5, v4
 ; VI-NEXT:    v_bcnt_u32_b32 v5, v7, v6
-; VI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
@@ -507,22 +507,22 @@ define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; VI-LABEL: v_ctpop_i128:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v2, v3, v2
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i128, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 483402d4778d1..ec532c8e4adc3 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -111,19 +111,19 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; VI-LABEL: v_cttz_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i32:
@@ -204,21 +204,21 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; VI-LABEL: v_cttz_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v1
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v1, 32, v1
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_v2i32:
@@ -310,15 +310,15 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; VI-LABEL: v_cttz_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v3, v3
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
@@ -328,7 +328,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    v_min_u32_e32 v1, 32, v1
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_v4i32:
@@ -427,21 +427,21 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; VI-LABEL: v_cttz_i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_mov_b32 s10, s2
-; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s6
-; VI-NEXT:    s_mov_b32 s9, s7
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v0, 0x100, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
-; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i8:
@@ -598,16 +598,16 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; VI-LABEL: s_cttz_i64_trunc:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_ff1_i32_b64 s4, s[6:7]
-; VI-NEXT:    s_min_u32 s4, s4, 64
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_ff1_i32_b64 s0, s[2:3]
+; VI-NEXT:    s_min_u32 s0, s0, 64
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_cttz_i64_trunc:
@@ -678,16 +678,16 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; VI-LABEL: v_cttz_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v3
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, s5
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v1
@@ -787,16 +787,16 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; VI-LABEL: v_cttz_i64_trunc:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s7
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
 ; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v0
-; VI-NEXT:    v_mov_b32_e32 v4, s5
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
+; VI-NEXT:    v_mov_b32_e32 v4, s1
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v2
@@ -895,18 +895,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_cttz_i32_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i32_sel_eq_neg1:
@@ -989,18 +989,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_cttz_i32_sel_ne_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i32_sel_ne_neg1:
@@ -1087,21 +1087,21 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i32_sel_eq_bitwidth:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i32_sel_eq_bitwidth:
@@ -1192,21 +1192,21 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
@@ -1293,17 +1293,17 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i8_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
-; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i8_sel_eq_neg1:
@@ -1393,24 +1393,24 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i16_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_mov_b32 s10, s2
-; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s6
-; VI-NEXT:    s_mov_b32 s9, s7
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i16_sel_eq_neg1:
@@ -1499,18 +1499,18 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_cttz_i7_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_i7_sel_eq_neg1:
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index a6cbfa52db532..086d99916ba04 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -86,17 +86,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_cttz_zero_undef_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -161,15 +161,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; VI-LABEL: v_cttz_zero_undef_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v1
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
@@ -241,15 +241,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; VI-LABEL: v_cttz_zero_undef_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v3, v3
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
@@ -510,13 +510,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: s_cttz_zero_undef_i64_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ff1_i32_b64 s0, s[6:7]
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    s_ff1_i32_b64 s2, s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -577,17 +577,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; VI-LABEL: v_cttz_zero_undef_i8_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v0
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -668,14 +668,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_cttz_zero_undef_i16_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s6, 1
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    s_add_u32 s4, s2, 1
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -685,8 +685,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v0
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -776,22 +776,22 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_cttz_zero_undef_i32_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s6, 3
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_add_u32 s0, s6, 2
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    s_add_u32 s0, s6, 1
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v7, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s2, s2, 1
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v7, s3
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v6, s2
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[6:7]
@@ -807,8 +807,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v2, 32, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -913,43 +913,43 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; VI-LABEL: v_cttz_zero_undef_i64_with_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s6, 5
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_add_u32 s0, s6, 4
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_add_u32 s0, s6, 7
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    s_add_u32 s0, s6, 6
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v7, s1
-; VI-NEXT:    v_mov_b32_e32 v6, s0
-; VI-NEXT:    s_add_u32 s0, s6, 3
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v9, s1
-; VI-NEXT:    v_mov_b32_e32 v8, s0
-; VI-NEXT:    s_add_u32 s0, s6, 2
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v11, s1
-; VI-NEXT:    v_mov_b32_e32 v10, s0
+; VI-NEXT:    s_add_u32 s4, s2, 5
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_add_u32 s4, s2, 4
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    s_add_u32 s4, s2, 7
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    s_add_u32 s4, s2, 6
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v7, s5
+; VI-NEXT:    v_mov_b32_e32 v6, s4
+; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v9, s5
+; VI-NEXT:    v_mov_b32_e32 v8, s4
+; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v11, s5
+; VI-NEXT:    v_mov_b32_e32 v10, s4
 ; VI-NEXT:    flat_load_ubyte v12, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v13, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v4, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v5, v[6:7]
-; VI-NEXT:    s_add_u32 s0, s6, 1
+; VI-NEXT:    s_add_u32 s4, s2, 1
 ; VI-NEXT:    flat_load_ubyte v6, v[8:9]
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_load_ubyte v7, v[10:11]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
@@ -976,9 +976,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_or_b32_e32 v0, v4, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    v_min_u32_e32 v0, v3, v0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_min_u32_e32 v0, 64, v0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1091,22 +1091,22 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_cttz_i32_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s6, 3
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_add_u32 s0, s6, 2
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    s_add_u32 s0, s6, 1
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v7, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s2, s2, 1
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v7, s3
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v6, s2
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[6:7]
@@ -1121,8 +1121,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1213,22 +1213,22 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; VI-LABEL: v_cttz_i32_sel_ne_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s6, 3
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_add_u32 s0, s6, 2
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    s_add_u32 s0, s6, 1
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v7, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s2, s2, 1
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v7, s3
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v6, s2
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[6:7]
@@ -1243,8 +1243,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1338,22 +1338,22 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s6, 3
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_add_u32 s0, s6, 2
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    s_add_u32 s0, s6, 1
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v7, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s2, s2, 1
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v7, s3
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v6, s2
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[6:7]
@@ -1371,8 +1371,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    v_min_u32_e32 v0, 32, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1453,10 +1453,10 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i8_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xff
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1464,8 +1464,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1544,14 +1544,14 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; VI-LABEL: v_cttz_i16_sel_eq_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s6, 1
-; VI-NEXT:    s_addc_u32 s1, s7, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    s_add_u32 s4, s2, 1
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
@@ -1563,8 +1563,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index fd4e182f6804e..4226728dbe118 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -918,17 +918,17 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
 ;
 ; VI-LABEL: load_i8_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_i8_to_f32:
@@ -955,13 +955,13 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
 ;
 ; GFX11-LABEL: load_i8_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -996,19 +996,19 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v2i8_to_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v2i8_to_v2f32:
@@ -1039,15 +1039,15 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX11-LABEL: load_v2i8_to_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1084,20 +1084,20 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v3i8_to_v3f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v3i8_to_v3f32:
@@ -1130,15 +1130,15 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX11-LABEL: load_v3i8_to_v3f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[4:5]
+; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1175,21 +1175,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v4i8_to_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32:
@@ -1224,17 +1224,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1281,11 +1281,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ;
 ; VI-LABEL: load_v4i8_to_v4f32_unaligned:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1297,8 +1297,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v1, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; VI-NEXT:    s_waitcnt vmcnt(2)
@@ -1307,7 +1307,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned:
@@ -1355,14 +1355,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_u8 v1, v0, s[6:7] offset:3
-; GFX11-NEXT:    global_load_u8 v2, v0, s[6:7] offset:2
-; GFX11-NEXT:    global_load_u8 v4, v0, s[6:7] offset:1
-; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] offset:3
+; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:2
+; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
@@ -1371,7 +1371,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b128 v5, v[0:3], s[4:5]
+; GFX11-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1426,16 +1426,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ;
 ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s8, 0x4000405
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s8, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v1, s11
-; VI-NEXT:    v_add_u32_e32 v4, vcc, s10, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1449,11 +1448,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[0:1]
-; VI-NEXT:    s_mov_b32 s8, 0x4000405
-; VI-NEXT:    s_mov_b32 s0, s6
-; VI-NEXT:    s_mov_b32 s1, s7
-; VI-NEXT:    s_mov_b32 s6, s2
-; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s4, s2
+; VI-NEXT:    s_mov_b32 s5, s3
+; VI-NEXT:    s_mov_b32 s2, s6
+; VI-NEXT:    s_mov_b32 s3, s7
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v6
@@ -1467,8 +1467,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; VI-NEXT:    v_or_b32_e32 v5, v7, v3
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    v_perm_b32 v4, v4, v5, s8
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v4, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
@@ -1521,15 +1521,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_u8 v1, v0, s[8:9] offset:2
-; GFX11-NEXT:    global_load_u8 v3, v0, s[8:9] offset:3
-; GFX11-NEXT:    global_load_u8 v2, v0, s[10:11] offset:3
-; GFX11-NEXT:    global_load_u8 v0, v0, s[10:11] offset:2
+; GFX11-NEXT:    global_load_u8 v1, v0, s[4:5] offset:2
+; GFX11-NEXT:    global_load_u8 v3, v0, s[4:5] offset:3
+; GFX11-NEXT:    global_load_u8 v2, v0, s[6:7] offset:3
+; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7] offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 8, v1
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
@@ -1541,8 +1541,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_perm_b32 v4, v4, v5, 0x4000405
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[4:5]
-; GFX11-NEXT:    global_store_b32 v6, v4, s[6:7]
+; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b32 v6, v4, s[2:3]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1605,22 +1605,23 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    v_mov_b32_e32 v5, 0xffffff00
-; VI-NEXT:    v_mov_b32_e32 v6, 9
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_mov_b32 s0, s6
-; VI-NEXT:    s_mov_b32 s1, s7
-; VI-NEXT:    s_mov_b32 s6, s2
-; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v6, 9
 ; VI-NEXT:    v_mov_b32_e32 v7, 0x900
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s2
+; VI-NEXT:    s_mov_b32 s5, s3
+; VI-NEXT:    s_mov_b32 s2, s6
+; VI-NEXT:    s_mov_b32 s3, s7
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
@@ -1630,14 +1631,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-NEXT:    v_add_u16_e32 v9, 9, v4
 ; VI-NEXT:    v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_nop 0
 ; VI-NEXT:    v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u16_e32 v0, 0x900, v0
 ; VI-NEXT:    v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
@@ -1701,12 +1702,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_add_nc_u16 v2, v0, 9
@@ -1732,9 +1732,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
-; GFX11-NEXT:    global_store_b32 v4, v5, s[6:7]
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b32 v4, v5, s[2:3]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1791,11 +1792,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v7i8_to_v7f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 5, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1816,8 +1817,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v9, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v5, v10
 ; VI-NEXT:    s_waitcnt vmcnt(4)
@@ -1830,8 +1831,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v8
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v9
-; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v7i8_to_v7f32:
@@ -1895,17 +1896,17 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX11-LABEL: load_v7i8_to_v7f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    global_load_u8 v4, v0, s[6:7] offset:6
-; GFX11-NEXT:    global_load_u8 v1, v0, s[6:7] offset:3
-; GFX11-NEXT:    global_load_u8 v2, v0, s[6:7] offset:2
-; GFX11-NEXT:    global_load_u8 v5, v0, s[6:7] offset:1
-; GFX11-NEXT:    global_load_d16_b16 v7, v0, s[6:7] offset:4
-; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:6
+; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] offset:3
+; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:2
+; GFX11-NEXT:    global_load_u8 v5, v0, s[2:3] offset:1
+; GFX11-NEXT:    global_load_d16_b16 v7, v0, s[2:3] offset:4
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
@@ -1920,8 +1921,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b96 v8, v[4:6], s[4:5] offset:16
-; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
+; GFX11-NEXT:    global_store_b96 v8, v[4:6], s[0:1] offset:16
+; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1963,15 +1964,15 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v8i8_to_v8f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[7:8], v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
@@ -1981,8 +1982,8 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
-; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v8i8_to_v8f32:
@@ -2027,11 +2028,11 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX11-LABEL: load_v8i8_to_v8f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[8:9], v0, s[6:7]
+; GFX11-NEXT:    global_load_b64 v[8:9], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v7, v9
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v6, v9
@@ -2042,8 +2043,8 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v8
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v8
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v10, v[4:7], s[4:5] offset:16
-; GFX11-NEXT:    global_store_b128 v10, v[0:3], s[4:5]
+; GFX11-NEXT:    global_store_b128 v10, v[4:7], s[0:1] offset:16
+; GFX11-NEXT:    global_store_b128 v10, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2078,19 +2079,19 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: i8_zext_inreg_i32_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: i8_zext_inreg_i32_to_f32:
@@ -2121,15 +2122,15 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX11-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2165,18 +2166,18 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: i8_zext_inreg_hi1_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32:
@@ -2205,13 +2206,13 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2248,17 +2249,17 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: i8_zext_i32_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: i8_zext_i32_to_f32:
@@ -2285,13 +2286,13 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX11-LABEL: i8_zext_i32_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2335,11 +2336,11 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2351,8 +2352,8 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v1, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; VI-NEXT:    s_waitcnt vmcnt(2)
@@ -2361,7 +2362,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32:
@@ -2409,14 +2410,14 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_u8 v1, v0, s[6:7] offset:3
-; GFX11-NEXT:    global_load_u8 v2, v0, s[6:7] offset:2
-; GFX11-NEXT:    global_load_u8 v4, v0, s[6:7] offset:1
-; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] offset:3
+; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:2
+; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
@@ -2425,7 +2426,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b128 v5, v[0:3], s[4:5]
+; GFX11-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2460,18 +2461,18 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte0_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: extract_byte0_to_f32:
@@ -2500,13 +2501,13 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX11-LABEL: extract_byte0_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2541,18 +2542,18 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte1_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: extract_byte1_to_f32:
@@ -2581,13 +2582,13 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX11-LABEL: extract_byte1_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2623,18 +2624,18 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte2_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: extract_byte2_to_f32:
@@ -2663,13 +2664,13 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX11-LABEL: extract_byte2_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2705,18 +2706,18 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; VI-LABEL: extract_byte3_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: extract_byte3_to_f32:
@@ -2745,13 +2746,13 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX11-LABEL: extract_byte3_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 67b0cef92d4d3..739fff5084135 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -396,12 +396,12 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
 ;
 ; GFX11-LABEL: uniform_vec_i16_LH:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_pack_lh_b32_b16 s0, s6, s7
+; GFX11-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -486,12 +486,12 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX11-LABEL: uniform_vec_i16_HH:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_pack_hh_b32_b16 s0, s6, s7
+; GFX11-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 44d65c9e50086..70011e56d016e 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -208,30 +208,30 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
 ;
 ; VI-LABEL: extract_vector_elt_v3f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:2
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_vector_elt_v3f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s6
-; GFX11-NEXT:    s_mov_b32 s0, s4
-; GFX11-NEXT:    s_mov_b32 s1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    s_mov_b32 s4, s0
+; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 offset:2
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 offset:2
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -278,16 +278,16 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou
 ; GFX11-LABEL: dynamic_extract_vector_elt_v3f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x34
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s0, s2, 4
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshr_b64 s[0:1], s[6:7], s0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_mov_b32 s6, -1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index c35e1e2a7e68a..f34824cd6cefe 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -212,14 +212,14 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ;
 ; GFX11-LABEL: s_fabs_v4f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_and_b32 s0, s6, 0x7fff7fff
-; GFX11-NEXT:    s_and_b32 s1, s7, 0x7fff7fff
+; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
+; GFX11-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 8f0d6393641a2..07581ade57ccd 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -113,14 +113,14 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ;
 ; VI-LABEL: fabs_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s0, s7, 0x7fffffff
-; VI-NEXT:    s_and_b32 s1, s6, 0x7fffffff
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    s_bitset0_b32 s3, 31
+; VI-NEXT:    s_bitset0_b32 s2, 31
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
@@ -185,12 +185,12 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa
 ;
 ; VI-LABEL: fabsf_fn_fold:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    v_mul_f32_e64 v2, |s6|, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_mul_f32_e64 v2, |s2|, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %fabs = call float @fabsf(float %in0)
@@ -215,12 +215,12 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i
 ;
 ; VI-LABEL: fabs_fold:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    v_mul_f32_e64 v2, |s6|, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_mul_f32_e64 v2, |s2|, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %fabs = call float @llvm.fabs.f32(float %in0)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index bdd08aa6287a9..d53c0411ad88c 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -123,11 +123,11 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ;
 ; GFX12-LABEL: s_test_canonicalize_var_f32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e64 v1, s6, s6
-; GFX12-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-NEXT:    v_max_num_f32_e64 v1, s2, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index fd80580c2fb27..ae280c5a443e1 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1811,21 +1811,21 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
 ;
 ; VI-LABEL: s_copysign_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_movk_i32 s0, 0x7fff
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_movk_i32 s4, 0x7fff
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    s_lshr_b32 s1, s7, 16
-; VI-NEXT:    s_lshr_b32 s2, s6, 16
-; VI-NEXT:    v_bfi_b32 v0, s0, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_lshr_b32 s3, s3, 16
+; VI-NEXT:    s_lshr_b32 s2, s2, 16
+; VI-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s1
-; VI-NEXT:    v_bfi_b32 v1, s0, v1, v2
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_bfi_b32 v1, s4, v1, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1850,20 +1850,21 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
 ;
 ; GFX11-LABEL: s_copysign_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s7, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT:    s_lshr_b32 s0, s6, 16
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s6, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fff, s0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s2, v0
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fff, s2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index fb04b66e1a6ad..f48961c905f58 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -21,25 +21,25 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag
 ;
 ; VI-LABEL: s_test_copysign_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_brev_b32 s0, -2
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_brev_b32 s4, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_bfi_b32 v2, s0, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_bfi_b32 v2, s4, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s7
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s6, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1018,26 +1018,26 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: s_test_copysign_f32_fpext_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_brev_b32 s0, -2
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_brev_b32 s4, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e64 v0, 16, s7
-; VI-NEXT:    v_mov_b32_e32 v1, s6
-; VI-NEXT:    v_bfi_b32 v2, s0, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_lshlrev_b32_e64 v0, 16, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_bfi_b32 v2, s4, v1, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f32_fpext_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s7
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s6, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1116,26 +1116,26 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: s_test_copysign_f32_fpext_bf16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_brev_b32 s0, -2
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_brev_b32 s4, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e64 v0, 16, s7
-; VI-NEXT:    v_mov_b32_e32 v1, s6
-; VI-NEXT:    v_bfi_b32 v2, s0, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_lshlrev_b32_e64 v0, 16, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_bfi_b32 v2, s4, v1, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f32_fpext_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s7
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s6, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index b63976876c41c..6c5b2917855fc 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -94,12 +94,12 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX8-LABEL: s_fdiv_f32_ninf:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_div_scale_f32 v1, s[0:1], s7, s7, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s7
-; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s6, v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], s3, s3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s2, v2, s2
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX8-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
@@ -110,9 +110,9 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX8-NEXT:    v_div_fixup_f32 v2, v1, s7, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_div_fixup_f32 v2, v1, s3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -139,10 +139,10 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX11-LABEL: s_fdiv_f32_ninf:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_div_scale_f32 v0, null, s7, s7, s6
-; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
+; GFX11-NEXT:    v_div_scale_f32 v0, null, s3, s3, s2
+; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -155,8 +155,8 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -253,12 +253,12 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX8-LABEL: s_fdiv_f32_ieee:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_div_scale_f32 v1, s[0:1], s7, s7, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s7
-; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s6, v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], s3, s3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s2, v2, s2
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
 ; GFX8-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
 ; GFX8-NEXT:    v_fma_f32 v3, v4, v3, v3
@@ -267,9 +267,9 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
 ; GFX8-NEXT:    v_fma_f32 v4, v5, v3, v4
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX8-NEXT:    v_div_fixup_f32 v2, v1, s7, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_div_fixup_f32 v2, v1, s3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -294,22 +294,22 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX11-LABEL: s_fdiv_f32_ieee:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_div_scale_f32 v0, null, s7, s7, s6
+; GFX11-NEXT:    v_div_scale_f32 v0, null, s3, s3, s2
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX11-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
+; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX11-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX11-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX11-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX11-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -353,18 +353,18 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
 ;
 ; GFX8-LABEL: s_fdiv_25ulp_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0x6f800000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x2f800000
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_gt_f32_e64 vcc, |s7|, v0
+; GFX8-NEXT:    v_cmp_gt_f32_e64 vcc, |s3|, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v1, s7, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, s3, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, s6, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v2, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -384,17 +384,17 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
 ;
 ; GFX11-LABEL: s_fdiv_25ulp_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |s7|
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0
-; GFX11-NEXT:    v_mul_f32_e32 v1, s7, v0
+; GFX11-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |s3|
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4
+; GFX11-NEXT:    v_mul_f32_e32 v1, s3, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v1, s6, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -465,18 +465,18 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a
 ;
 ; GFX8-LABEL: s_fdiv_25ulp_ieee_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_frexp_mant_f32_e32 v1, s7
+; GFX8-NEXT:    v_frexp_mant_f32_e32 v1, s3
 ; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v0, s7
-; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v2, s6
-; GFX8-NEXT:    v_frexp_mant_f32_e32 v3, s6
+; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v0, s3
+; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v2, s2
+; GFX8-NEXT:    v_frexp_mant_f32_e32 v3, s2
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v2, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v2, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -498,19 +498,19 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a
 ;
 ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_frexp_mant_f32_e32 v0, s7
-; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, s7
-; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, s6
-; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v3, s6
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v0, s3
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, s3
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, s2
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v3, s2
 ; GFX11-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -548,12 +548,12 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a,
 ;
 ; GFX8-LABEL: s_fdiv_fast_ieee_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s7
-; GFX8-NEXT:    v_mul_f32_e32 v2, s6, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -569,12 +569,12 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a,
 ;
 ; GFX11-LABEL: s_fdiv_fast_ieee_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s6, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -612,12 +612,12 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a,
 ;
 ; GFX8-LABEL: s_fdiv_f32_fast_math:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s7
-; GFX8-NEXT:    v_mul_f32_e32 v2, s6, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -633,12 +633,12 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a,
 ;
 ; GFX11-LABEL: s_fdiv_f32_fast_math:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s6, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -676,12 +676,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo
 ;
 ; GFX8-LABEL: s_fdiv_ulp25_f32_fast_math:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s7
-; GFX8-NEXT:    v_mul_f32_e32 v2, s6, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -697,12 +697,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo
 ;
 ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s6, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -805,12 +805,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ;
 ; GFX8-LABEL: s_fdiv_f32_arcp_daz:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_div_scale_f32 v1, s[0:1], s7, s7, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s7
-; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s6, v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], s3, s3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s2, v2, s2
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX8-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
@@ -821,9 +821,9 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX8-NEXT:    v_div_fixup_f32 v2, v1, s7, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_div_fixup_f32 v2, v1, s3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -850,10 +850,10 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ;
 ; GFX11-LABEL: s_fdiv_f32_arcp_daz:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_div_scale_f32 v0, null, s7, s7, s6
-; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
+; GFX11-NEXT:    v_div_scale_f32 v0, null, s3, s3, s2
+; GFX11-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -866,8 +866,8 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -905,12 +905,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a,
 ;
 ; GFX8-LABEL: s_fdiv_f32_arcp_ninf:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f32_e32 v0, s7
-; GFX8-NEXT:    v_mul_f32_e32 v2, s6, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX8-NEXT:    v_mul_f32_e32 v2, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -926,12 +926,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a,
 ;
 ; GFX11-LABEL: s_fdiv_f32_arcp_ninf:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX11-NEXT:    v_rcp_f32_e32 v0, s3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s6, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index ab3650f5650ac..26714dcc6dfac 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -4300,14 +4300,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_add_u32 s0, s4, 16
-; GCN2-NEXT:    s_addc_u32 s1, s5, 0
+; GCN2-NEXT:    s_add_u32 s0, s0, 16
+; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
-; GCN2-NEXT:    v_mov_b32_e32 v2, s6
+; GCN2-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
-; GCN2-NEXT:    v_mov_b32_e32 v3, s7
+; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
@@ -4557,12 +4557,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
 ;
 ; GCN2-LABEL: atomic_cmpxchg_i32:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v0, s4
-; GCN2-NEXT:    v_mov_b32_e32 v2, s6
-; GCN2-NEXT:    v_mov_b32_e32 v1, s5
-; GCN2-NEXT:    v_mov_b32_e32 v3, s7
+; GCN2-NEXT:    v_mov_b32_e32 v0, s0
+; GCN2-NEXT:    v_mov_b32_e32 v2, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s1
+; GCN2-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN2-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 816142dd17cce..66aacd7062a6d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -3853,13 +3853,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN2-LABEL: atomic_max_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_ashr_i32 s1, s7, 31
-; GCN2-NEXT:    s_mov_b32 s0, s7
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s4, s0
-; GCN2-NEXT:    s_addc_u32 s1, s5, s1
+; GCN2-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN2-NEXT:    s_mov_b32 s4, s3
+; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s4
+; GCN2-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -3869,7 +3869,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN2-NEXT:  .LBB88_1: ; %atomicrmw.start
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_max_i32_e32 v2, s6, v3
+; GCN2-NEXT:    v_max_i32_e32 v2, s2, v3
 ; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
@@ -4057,13 +4057,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ;
 ; GCN2-LABEL: atomic_max_i32_addr64:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_ashr_i32 s1, s7, 31
-; GCN2-NEXT:    s_mov_b32 s0, s7
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s4, s0
-; GCN2-NEXT:    s_addc_u32 s1, s5, s1
+; GCN2-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN2-NEXT:    s_mov_b32 s4, s3
+; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s4
+; GCN2-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    flat_load_dword v3, v[0:1]
@@ -4071,7 +4071,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ; GCN2-NEXT:  .LBB90_1: ; %atomicrmw.start
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_max_i32_e32 v2, s6, v3
+; GCN2-NEXT:    v_max_i32_e32 v2, s2, v3
 ; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
@@ -4996,13 +4996,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ;
 ; GCN2-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_ashr_i32 s1, s7, 31
-; GCN2-NEXT:    s_mov_b32 s0, s7
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s4, s0
-; GCN2-NEXT:    s_addc_u32 s1, s5, s1
+; GCN2-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN2-NEXT:    s_mov_b32 s4, s3
+; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s4
+; GCN2-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -5012,7 +5012,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ; GCN2-NEXT:  .LBB102_1: ; %atomicrmw.start
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_max_u32_e32 v2, s6, v3
+; GCN2-NEXT:    v_max_u32_e32 v2, s2, v3
 ; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
@@ -6790,13 +6790,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN2-LABEL: atomic_min_i32_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
-; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_ashr_i32 s1, s7, 31
-; GCN2-NEXT:    s_mov_b32 s0, s7
-; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN2-NEXT:    s_add_u32 s0, s4, s0
-; GCN2-NEXT:    s_addc_u32 s1, s5, s1
+; GCN2-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN2-NEXT:    s_mov_b32 s4, s3
+; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GCN2-NEXT:    s_add_u32 s0, s0, s4
+; GCN2-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN2-NEXT:    s_add_u32 s0, s0, 16
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -6806,7 +6806,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN2-NEXT:  .LBB125_1: ; %atomicrmw.start
 ; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_min_i32_e32 v2, s6, v3
+; GCN2-NEXT:    v_min_i32_e32 v2, s2, v3
 ; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index 2a9a9ef7c43b6..bac2d8b8b40c2 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -41,17 +41,17 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p
 ;
 ; GFX11-LABEL: combine_to_fma_f64_0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[6:7] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -101,22 +101,22 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o
 ;
 ; GFX11-LABEL: combine_to_fma_f64_0_2use:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[6:7] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc
+; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[4:5], v[0:1], v[2:3], v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
-; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[4:5] dlc
+; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc
+; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -166,17 +166,17 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p
 ;
 ; GFX11-LABEL: combine_to_fma_f64_1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[6:7] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -220,17 +220,17 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_0_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[6:7] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
-; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -280,22 +280,22 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[6:7] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc
+; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[4:5], v[0:1], v[2:3], -v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7]
-; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[4:5] dlc
+; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc
+; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -345,17 +345,17 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_1_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[6:7] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
-; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -405,22 +405,22 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[6:7] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc
+; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[6:7]
-; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[4:5] dlc
+; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc
+; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -470,17 +470,17 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_2_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[6:7] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5]
-; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -532,22 +532,22 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1)
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[6:7] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc
+; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[6:7]
-; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[4:5] dlc
+; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc
+; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -605,22 +605,22 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1)
 ;
 ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[6:7] glc dlc
+; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc
+; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc
+; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc
+; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7]
-; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[4:5] dlc
+; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc
+; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -703,47 +703,47 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
 ;
 ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
 ; GFX11-NOFMA:       ; %bb.0:
-; GFX11-NOFMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NOFMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
 ; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[6:7] glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[6:7] offset:8 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[6:7] offset:16 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[6:7] offset:24 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[6:7] offset:32 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NOFMA-NEXT:    v_mul_f64 v[6:7], v[6:7], v[8:9]
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
 ; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
-; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
+; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
 ; GFX11-NOFMA-NEXT:    s_nop 0
 ; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
 ; GFX11-FMA:       ; %bb.0:
-; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
 ; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v10, s[6:7] glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v10, s[6:7] offset:8 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v10, s[6:7] offset:16 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[6:7], v10, s[6:7] offset:24 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[8:9], v10, s[6:7] offset:32 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f64 v[4:5], v[6:7], v[8:9], -v[4:5]
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
+; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
 ; GFX11-FMA-NEXT:    s_nop 0
 ; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
@@ -825,47 +825,47 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
 ;
 ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
 ; GFX11-NOFMA:       ; %bb.0:
-; GFX11-NOFMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NOFMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
 ; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[6:7] glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[6:7] offset:8 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[6:7] offset:16 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[6:7] offset:24 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[6:7] offset:32 glc dlc
+; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NOFMA-NEXT:    v_mul_f64 v[6:7], v[6:7], v[8:9]
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
-; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
+; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
 ; GFX11-NOFMA-NEXT:    s_nop 0
 ; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
 ; GFX11-FMA:       ; %bb.0:
-; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
 ; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v10, s[6:7] glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v10, s[6:7] offset:8 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v10, s[6:7] offset:16 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[6:7], v10, s[6:7] offset:24 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b64 v[8:9], v10, s[6:7] offset:32 glc dlc
+; GFX11-FMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[8:9], v[0:1]
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1]
-; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
+; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
 ; GFX11-FMA-NEXT:    s_nop 0
 ; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 0cb6b783f496e..7830c91851bfa 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -46,11 +46,11 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
 ;
 ; GFX11-LABEL: multiple_fadd_use_test_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_add_f32_e64 v0, s7, -1.0
-; GFX11-NEXT:    v_add_f32_e64 v1, s6, -1.0
+; GFX11-NEXT:    v_add_f32_e64 v0, s3, -1.0
+; GFX11-NEXT:    v_add_f32_e64 v1, s2, -1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 vcc_lo, |v0|, |v1|
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
@@ -58,7 +58,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v0, v0
 ; GFX11-NEXT:    v_fma_f32 v0, -v1, v0, 1.0
-; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -174,14 +174,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
 ;
 ; GFX11-LABEL: multiple_use_fadd_fmad_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_add_f32_e64 v1, |s6|, |s6|
-; GFX11-NEXT:    v_fma_f32 v2, |s6|, 2.0, s7
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5] dlc
+; GFX11-NEXT:    v_add_f32_e64 v1, |s2|, |s2|
+; GFX11-NEXT:    v_fma_f32 v2, |s2|, 2.0, s3
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b32 v0, v2, s[4:5] offset:4 dlc
+; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1] offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index bde0dc326f9ac..9943976dd86da 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -1339,11 +1339,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
 define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
 ; VI-FLUSH-LABEL: mad_sub_f16:
 ; VI-FLUSH:       ; %bb.0:
-; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s7
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1355,8 +1355,8 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s5
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, v2, -v3
 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
@@ -1364,11 +1364,11 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ;
 ; VI-DENORM-CONTRACT-LABEL: mad_sub_f16:
 ; VI-DENORM-CONTRACT:       ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s7
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1380,8 +1380,8 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s5
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, -v3
 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
@@ -1436,55 +1436,55 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_f16:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-FLUSH-NEXT:    s_nop 0
 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16:
 ; GFX11-DENORM-STRICT:       ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16:
 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
-; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
@@ -1508,11 +1508,11 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
 ; VI-FLUSH-LABEL: mad_sub_inv_f16:
 ; VI-FLUSH:       ; %bb.0:
-; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s7
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1524,8 +1524,8 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s5
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_mad_f16 v2, -v7, v2, v3
 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
@@ -1533,11 +1533,11 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
 ; VI-DENORM-CONTRACT:       ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s7
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1549,8 +1549,8 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s5
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, -v7, v2, v3
 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
@@ -1605,55 +1605,55 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_inv_f16:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-FLUSH-NEXT:    s_nop 0
 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16:
 ; GFX11-DENORM-STRICT:       ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
-; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
@@ -1677,11 +1677,11 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
 ; VI-FLUSH-LABEL: mad_sub_fabs_f16:
 ; VI-FLUSH:       ; %bb.0:
-; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s7
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1693,8 +1693,8 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s5
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, v2, -|v3|
 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
@@ -1702,11 +1702,11 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
 ; VI-DENORM-CONTRACT:       ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s7
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1718,8 +1718,8 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s5
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, -|v3|
 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
@@ -1774,55 +1774,55 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-FLUSH-NEXT:    s_nop 0
 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
 ; GFX11-DENORM-STRICT:       ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
-; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
@@ -1847,11 +1847,11 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
 ; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16:
 ; VI-FLUSH:       ; %bb.0:
-; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s7
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1863,8 +1863,8 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s5
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_mad_f16 v2, -v7, v2, |v3|
 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
@@ -1872,11 +1872,11 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ;
 ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
 ; VI-DENORM-CONTRACT:       ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s7
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1888,8 +1888,8 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s5
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, -v7, v2, |v3|
 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
@@ -1944,55 +1944,55 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-FLUSH-NEXT:    s_nop 0
 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX11-DENORM-STRICT:       ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
-; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
@@ -2017,11 +2017,11 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
 ; VI-FLUSH-LABEL: neg_neg_mad_f16:
 ; VI-FLUSH:       ; %bb.0:
-; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s7
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2033,8 +2033,8 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s5
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v7, v2
 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
@@ -2042,11 +2042,11 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
 ; VI-DENORM-CONTRACT:       ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s7
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2058,8 +2058,8 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s5
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, v3
 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
@@ -2114,55 +2114,55 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; GFX11-FLUSH-LABEL: neg_neg_mad_f16:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-FLUSH-NEXT:    s_nop 0
 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16:
 ; GFX11-DENORM-STRICT:       ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v3, s[4:5]
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v3, s[0:1]
 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
@@ -2188,11 +2188,11 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
 ; VI-FLUSH-LABEL: mad_fabs_sub_f16:
 ; VI-FLUSH:       ; %bb.0:
-; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s7
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2204,8 +2204,8 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s5
-; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, |v2|, -v3
 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
@@ -2213,11 +2213,11 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
 ; VI-DENORM-CONTRACT:       ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s7
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2229,8 +2229,8 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s5
-; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, |v2|, -v3
 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
@@ -2285,55 +2285,55 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-FLUSH-NEXT:    s_nop 0
 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
 ; GFX11-DENORM-STRICT:       ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[6:7] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[6:7] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
-; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index ce5bb6617d9f4..f411a76e75ab6 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -124,23 +124,23 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
 ;
 ; VI-LABEL: fnearbyint_v2f32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_rndne_f32_e32 v1, s7
-; VI-NEXT:    v_rndne_f32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_rndne_f32_e32 v1, s3
+; VI-NEXT:    v_rndne_f32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fnearbyint_v2f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rndne_f32_e32 v1, s7
-; GFX11-NEXT:    v_rndne_f32_e32 v0, s6
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_rndne_f32_e32 v1, s3
+; GFX11-NEXT:    v_rndne_f32_e32 v0, s2
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 277dc01ccd99c..b5440b9c38c9f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -2813,15 +2813,15 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %
 ;
 ; VI-LABEL: s_fneg_select_infloop_regression_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitcmp1_b32 s5, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_bitcmp1_b32 s1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e64 v2, -v0, 0, s[0:1]
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %i = select i1 %arg1, float 0.0, float %arg
@@ -3161,15 +3161,15 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar
 ;
 ; VI-LABEL: s_fneg_select_infloop_regression_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s0, 1, s5
-; VI-NEXT:    s_cselect_b32 s1, 0, s4
-; VI-NEXT:    s_xor_b32 s1, s1, 0x80008000
-; VI-NEXT:    s_cmp_eq_u32 s0, 1
-; VI-NEXT:    s_cselect_b32 s0, 0, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    s_and_b32 s1, 1, s1
+; VI-NEXT:    s_cselect_b32 s0, 0, s0
+; VI-NEXT:    s_xor_b32 s0, s0, 0x80008000
+; VI-NEXT:    s_cmp_eq_u32 s1, 1
+; VI-NEXT:    s_cselect_b32 s0, 0, s0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -3293,15 +3293,15 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %
 ;
 ; VI-LABEL: s_fabs_select_infloop_regression_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitcmp1_b32 s5, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_bitcmp1_b32 s1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e64 v2, |v0|, 0, s[0:1]
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %i = select i1 %arg1, float 0.0, float %arg
@@ -3343,15 +3343,15 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1
 ;
 ; VI-LABEL: s_fneg_fabs_select_infloop_regression:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitcmp1_b32 s5, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_bitcmp1_b32 s1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1]
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %i = select i1 %arg1, float 0.0, float %arg
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 79f1057af22e7..4364b32e62f8c 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -447,14 +447,14 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
 ;
 ; GFX11-LABEL: fneg_fabs_v4f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_or_b32 s0, s6, 0x80008000
-; GFX11-NEXT:    s_or_b32 s1, s7, 0x80008000
+; GFX11-NEXT:    s_or_b32 s2, s2, 0x80008000
+; GFX11-NEXT:    s_or_b32 s3, s3, 0x80008000
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 32033c52fc998..3c000d4fa63a3 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -18,12 +18,12 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x,
 ;
 ; VI-LABEL: fneg_fabsf_fadd_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_sub_f32_e64 v2, s7, |v0|
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_sub_f32_e64 v2, s3, |v0|
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -49,12 +49,12 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x,
 ;
 ; VI-LABEL: fneg_fabsf_fmul_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mul_f32_e64 v2, s7, -|v0|
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mul_f32_e64 v2, s3, -|v0|
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -213,14 +213,14 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
 ;
 ; VI-LABEL: fneg_fabsf_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_or_b32 s0, s7, 0x80000000
-; VI-NEXT:    s_or_b32 s1, s6, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    s_bitset1_b32 s3, 31
+; VI-NEXT:    s_bitset1_b32 s2, 31
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 94fc9293e774c..d78bdfe08772a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -65,27 +65,27 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
 ;
 ; VI-LABEL: s_fneg_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s0, s7, 0x80000000
-; VI-NEXT:    s_xor_b32 s1, s6, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
+; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fneg_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_xor_b32 s0, s6, 0x80000000
-; GFX11-NEXT:    s_xor_b32 s1, s7, 0x80000000
+; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT:    s_xor_b32 s3, s3, 0x80000000
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index 0095bcd89e33b..9f339af0f5580 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -16,17 +16,17 @@ declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %d
 define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
 ; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-SDAG-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-GISEL-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX12-GISEL-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
@@ -36,9 +36,9 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
 define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
 ; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-SDAG-NEXT:    ds_pk_add_bf16 v0, v1
 ; GFX12-SDAG-NEXT:    s_wait_dscnt 0x0
 ; GFX12-SDAG-NEXT:    global_inv scope:SCOPE_SYS
@@ -46,9 +46,9 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
 ;
 ; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
 ; GFX12-GISEL-NEXT:    ds_pk_add_bf16 v1, v0
 ; GFX12-GISEL-NEXT:    s_wait_dscnt 0x0
 ; GFX12-GISEL-NEXT:    global_inv scope:SCOPE_SYS
@@ -116,19 +116,19 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
 define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
 ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-SDAG-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-GISEL-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
 ; GFX12-GISEL-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
@@ -164,19 +164,19 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
 define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
 ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-SDAG-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-GISEL-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
 ; GFX12-GISEL-NEXT:    s_endpgm
   %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
@@ -212,20 +212,20 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
 define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
 ; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-SDAG-NEXT:    global_atomic_pk_add_bf16 v0, v1, s[4:5]
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT:    global_atomic_pk_add_bf16 v0, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-GISEL-NEXT:    global_atomic_pk_add_bf16 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT:    global_atomic_pk_add_bf16 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 7f87b41127767..f55e9f4821b47 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -24,10 +24,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
 ;
 ; GFX12-LABEL: flat_atomic_fadd_f32_noret:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
   %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
@@ -190,10 +190,10 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
 ;
 ; GFX12-LABEL: flat_atomic_fadd_v2f16_noret:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
@@ -235,10 +235,10 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da
 ;
 ; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
   %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
@@ -280,10 +280,10 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr
 ;
 ; GFX12-LABEL: global_atomic_fadd_v2bf16_noret:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-NEXT:    global_atomic_pk_add_bf16 v0, v1, s[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_atomic_pk_add_bf16 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -325,9 +325,9 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
 ;
 ; GFX12-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX12-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
@@ -371,9 +371,9 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
 ;
 ; GFX12-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    ds_pk_add_bf16 v0, v1
 ; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index ca2fa0f20f0f5..18d2e52e8f900 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -362,29 +362,29 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
 ;
 ; VI-LABEL: test_isfinite_not_pattern_2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cmp_o_f32_e64 s[0:1], s6, s6
-; VI-NEXT:    v_cmp_neq_f32_e64 s[2:3], |s7|, v0
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_cmp_o_f32_e64 s[4:5], s2, s2
+; VI-NEXT:    v_cmp_neq_f32_e64 s[2:3], |s3|, v0
+; VI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_isfinite_not_pattern_2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_o_f32_e64 s0, s6, s6
-; GFX11-NEXT:    v_cmp_neq_f32_e64 s1, 0x7f800000, |s7|
+; GFX11-NEXT:    v_cmp_o_f32_e64 s2, s2, s2
+; GFX11-NEXT:    v_cmp_neq_f32_e64 s3, 0x7f800000, |s3|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s0, s0, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
index 2928647a9627d..a6fadbed33c86 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
@@ -676,39 +676,39 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s8
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s9
-; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
 ; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    global_store_dword v1, v0, s[10:11]
+; GFX1030-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX1100:       ; %bb.0: ; %main_body
-; GFX1100-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX1100-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc
+; GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc
 ; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-NEXT:    global_store_b32 v1, v0, s[10:11]
+; GFX1100-NEXT:    global_store_b32 v1, v0, s[6:7]
 ; GFX1100-NEXT:    s_nop 0
 ; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT:    s_mov_b32 s0, 4
+; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[4:7], s0 offen th:TH_ATOMIC_NT_RETURN
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_mov_b32 s4, 4
+; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v1, v0, s[10:11]
+; GFX12-NEXT:    global_store_b32 v1, v0, s[6:7]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -755,25 +755,25 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s8
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s9
-; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
 ; G_GFX1030-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    global_store_dword v1, v0, s[10:11]
+; G_GFX1030-NEXT:    global_store_dword v1, v0, s[6:7]
 ; G_GFX1030-NEXT:    s_endpgm
 ;
 ; G_GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX1100:       ; %bb.0: ; %main_body
-; G_GFX1100-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; G_GFX1100-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; G_GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; G_GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc
+; G_GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; G_GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc
 ; G_GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX1100-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1100-NEXT:    global_store_b32 v1, v0, s[10:11]
+; G_GFX1100-NEXT:    global_store_b32 v1, v0, s[6:7]
 ; G_GFX1100-NEXT:    s_nop 0
 ; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
index f4745a5acbcd6..ea26ad06cbb16 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
@@ -50,22 +50,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8)
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
 ; GFX1030:       ; %bb.0: ; %main_body
 ; GFX1030-NEXT:    s_clause 0x1
-; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
 ; GFX1100:       ; %bb.0: ; %main_body
 ; GFX1100-NEXT:    s_clause 0x1
-; GFX1100-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
-; GFX1100-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1100-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX1100-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX1100-NEXT:    buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen
+; GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1100-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
 ; GFX1100-NEXT:    s_nop 0
 ; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
@@ -408,22 +408,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8)
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
 ; GFX1030:       ; %bb.0: ; %main_body
 ; GFX1030-NEXT:    s_clause 0x1
-; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
 ; GFX1100:       ; %bb.0: ; %main_body
 ; GFX1100-NEXT:    s_clause 0x1
-; GFX1100-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
-; GFX1100-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1100-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
+; GFX1100-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen
+; GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
 ; GFX1100-NEXT:    s_nop 0
 ; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
@@ -619,25 +619,25 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
 ;
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s8
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s9
-; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
 ; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    global_store_dword v1, v0, s[10:11]
+; GFX1030-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX1100:       ; %bb.0: ; %main_body
-; GFX1100-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX1100-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc
+; GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc
 ; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-NEXT:    global_store_b32 v1, v0, s[10:11]
+; GFX1100-NEXT:    global_store_b32 v1, v0, s[6:7]
 ; GFX1100-NEXT:    s_nop 0
 ; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
@@ -684,25 +684,25 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
 ;
 ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s8
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s9
-; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
 ; G_GFX1030-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    global_store_dword v1, v0, s[10:11]
+; G_GFX1030-NEXT:    global_store_dword v1, v0, s[6:7]
 ; G_GFX1030-NEXT:    s_endpgm
 ;
 ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX1100:       ; %bb.0: ; %main_body
-; G_GFX1100-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; G_GFX1100-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; G_GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; G_GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc
+; G_GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; G_GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc
 ; G_GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX1100-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1100-NEXT:    global_store_b32 v1, v0, s[10:11]
+; G_GFX1100-NEXT:    global_store_b32 v1, v0, s[6:7]
 ; G_GFX1100-NEXT:    s_nop 0
 ; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
index dc1f8ca0b7d0f..d827ea0503a3b 100644
--- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
@@ -28,38 +28,38 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o
 ;
 ; GFX8-LABEL: test_convert_fp16_to_fp32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    s_mov_b32 s10, s2
-; GFX8-NEXT:    s_mov_b32 s11, s3
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    s_mov_b32 s10, s6
+; GFX8-NEXT:    s_mov_b32 s11, s7
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s8, s6
-; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s2
+; GFX8-NEXT:    s_mov_b32 s9, s3
 ; GFX8-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX8-NEXT:    s_mov_b32 s0, s4
-; GFX8-NEXT:    s_mov_b32 s1, s5
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_mov_b32 s5, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_convert_fp16_to_fp32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s10, s2
-; GFX11-NEXT:    s_mov_b32 s11, s3
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s10, s6
+; GFX11-NEXT:    s_mov_b32 s11, s7
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s8, s6
-; GFX11-NEXT:    s_mov_b32 s9, s7
-; GFX11-NEXT:    s_mov_b32 s0, s4
+; GFX11-NEXT:    s_mov_b32 s8, s2
+; GFX11-NEXT:    s_mov_b32 s9, s3
+; GFX11-NEXT:    s_mov_b32 s4, s0
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_mov_b32 s1, s5
+; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
index 03ee67dc16678..03b8251ea4640 100644
--- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
@@ -27,41 +27,41 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o
 ;
 ; GFX8-LABEL: test_convert_fp16_to_fp64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    s_mov_b32 s10, s2
-; GFX8-NEXT:    s_mov_b32 s11, s3
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    s_mov_b32 s10, s6
+; GFX8-NEXT:    s_mov_b32 s11, s7
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s8, s6
-; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s2
+; GFX8-NEXT:    s_mov_b32 s9, s3
 ; GFX8-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX8-NEXT:    s_mov_b32 s0, s4
-; GFX8-NEXT:    s_mov_b32 s1, s5
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_mov_b32 s5, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_convert_fp16_to_fp64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s10, s2
-; GFX11-NEXT:    s_mov_b32 s11, s3
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s10, s6
+; GFX11-NEXT:    s_mov_b32 s11, s7
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s8, s6
-; GFX11-NEXT:    s_mov_b32 s9, s7
-; GFX11-NEXT:    s_mov_b32 s0, s4
+; GFX11-NEXT:    s_mov_b32 s8, s2
+; GFX11-NEXT:    s_mov_b32 s9, s3
+; GFX11-NEXT:    s_mov_b32 s4, s0
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_mov_b32 s1, s5
+; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 4f6ea77205133..8ab82b722445e 100644
--- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -27,38 +27,38 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o
 ;
 ; GFX8-LABEL: test_convert_fp32_to_fp16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    s_mov_b32 s10, s2
-; GFX8-NEXT:    s_mov_b32 s11, s3
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    s_mov_b32 s10, s6
+; GFX8-NEXT:    s_mov_b32 s11, s7
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s8, s6
-; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s2
+; GFX8-NEXT:    s_mov_b32 s9, s3
 ; GFX8-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT:    s_mov_b32 s0, s4
-; GFX8-NEXT:    s_mov_b32 s1, s5
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_mov_b32 s5, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX8-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_convert_fp32_to_fp16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s10, s2
-; GFX11-NEXT:    s_mov_b32 s11, s3
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s10, s6
+; GFX11-NEXT:    s_mov_b32 s11, s7
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s8, s6
-; GFX11-NEXT:    s_mov_b32 s9, s7
-; GFX11-NEXT:    s_mov_b32 s0, s4
+; GFX11-NEXT:    s_mov_b32 s8, s2
+; GFX11-NEXT:    s_mov_b32 s9, s3
+; GFX11-NEXT:    s_mov_b32 s4, s0
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_mov_b32 s1, s5
+; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
index a058c1119d4fd..47164c9d32a8f 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
@@ -465,13 +465,13 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
 ;
 ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s8
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s9
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s11
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX1030-NEXT:    s_endpgm
@@ -519,13 +519,13 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
 ;
 ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s8
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s9
-; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s10
-; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
-; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s11
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s7
 ; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX1030-NEXT:    ds_write_b64 v2, v[0:1]
 ; G_GFX1030-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
index 046c92a2fc63f..3321948192bda 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
@@ -54,14 +54,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
 ; GFX1030:       ; %bb.0: ; %main_body
 ; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dword s8, s[0:1], 0x3c
-; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dword s6, s[0:1], 0x3c
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s8
-; GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
@@ -291,14 +291,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
 ; GFX1030:       ; %bb.0: ; %main_body
 ; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dword s8, s[0:1], 0x3c
-; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dword s6, s[0:1], 0x3c
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s8
-; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
@@ -465,13 +465,13 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
 ;
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s8
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s9
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s10
-; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s11
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX1030-NEXT:    s_endpgm
@@ -519,13 +519,13 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
 ;
 ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s8
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s9
-; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s10
-; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
-; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s11
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s7
 ; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX1030-NEXT:    ds_write_b64 v2, v[0:1]
 ; G_GFX1030-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index b4fee7017d698..04ef30bd26aa5 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -100,15 +100,15 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %
 ;
 ; VI-LABEL: fp_to_sint_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cvt_i32_f32_e32 v1, s7
-; VI-NEXT:    v_cvt_i32_f32_e32 v0, s6
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    v_cvt_i32_f32_e32 v1, s3
+; VI-NEXT:    v_cvt_i32_f32_e32 v0, s2
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: fp_to_sint_v2i32:
@@ -329,24 +329,24 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
 ;
 ; VI-LABEL: fp_to_sint_v2i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_mov_b32 s8, 0x2f800000
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_trunc_f32_e32 v0, s7
+; VI-NEXT:    v_trunc_f32_e32 v0, s3
 ; VI-NEXT:    v_mul_f32_e64 v1, |v0|, s8
-; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    v_floor_f32_e32 v1, v1
-; VI-NEXT:    s_mov_b32 s4, 0xcf800000
-; VI-NEXT:    v_fma_f32 v2, v1, s4, |v0|
-; VI-NEXT:    v_trunc_f32_e32 v4, s6
+; VI-NEXT:    s_mov_b32 s0, 0xcf800000
+; VI-NEXT:    v_fma_f32 v2, v1, s0, |v0|
+; VI-NEXT:    v_trunc_f32_e32 v4, s2
 ; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; VI-NEXT:    v_mul_f32_e64 v3, |v4|, s8
 ; VI-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; VI-NEXT:    v_floor_f32_e32 v3, v3
 ; VI-NEXT:    v_cvt_u32_f32_e32 v5, v3
-; VI-NEXT:    v_fma_f32 v3, v3, s4, |v4|
+; VI-NEXT:    v_fma_f32 v3, v3, s0, |v4|
 ; VI-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; VI-NEXT:    v_cvt_u32_f32_e32 v6, v3
 ; VI-NEXT:    v_xor_b32_e32 v2, v2, v0
@@ -357,9 +357,9 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
 ; VI-NEXT:    v_xor_b32_e32 v0, v6, v1
 ; VI-NEXT:    v_xor_b32_e32 v4, v5, v1
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: fp_to_sint_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index f8ede1cd557a7..5abf82aa1aab5 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -60,15 +60,15 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x
 ;
 ; VI-LABEL: fp_to_uint_v2f32_to_v2i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cvt_u32_f32_e32 v1, s7
-; VI-NEXT:    v_cvt_u32_f32_e32 v0, s6
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    v_cvt_u32_f32_e32 v1, s3
+; VI-NEXT:    v_cvt_u32_f32_e32 v0, s2
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: fp_to_uint_v2f32_to_v2i32:
@@ -264,26 +264,26 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
 ;
 ; VI-LABEL: fp_to_uint_v2f32_to_v2i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s0, 0xcf800000
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_trunc_f32_e32 v0, s7
-; VI-NEXT:    v_trunc_f32_e32 v4, s6
+; VI-NEXT:    v_trunc_f32_e32 v0, s3
+; VI-NEXT:    v_trunc_f32_e32 v4, s2
 ; VI-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; VI-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v4
 ; VI-NEXT:    v_floor_f32_e32 v5, v1
+; VI-NEXT:    s_mov_b32 s2, 0xcf800000
 ; VI-NEXT:    v_floor_f32_e32 v6, v2
-; VI-NEXT:    v_fma_f32 v0, v5, s0, v0
+; VI-NEXT:    v_fma_f32 v0, v5, s2, v0
 ; VI-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; VI-NEXT:    v_fma_f32 v0, v6, s0, v4
+; VI-NEXT:    v_fma_f32 v0, v6, s2, v4
 ; VI-NEXT:    v_cvt_u32_f32_e32 v3, v5
 ; VI-NEXT:    v_cvt_u32_f32_e32 v1, v6
 ; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: fp_to_uint_v2f32_to_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index fecf303d57691..3c4087fe391b6 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -126,12 +126,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; VI-LABEL: fshl_i32_imm:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    v_alignbit_b32 v2, s6, v0, 25
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 25
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -168,11 +168,11 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX11-LABEL: fshl_i32_imm:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, s6, s7, 25
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 25
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -720,15 +720,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ;
 ; VI-LABEL: orxor2or1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s0, s6, 7
-; VI-NEXT:    s_or_b32 s0, s7, s0
-; VI-NEXT:    s_cmp_eq_u32 s0, 0
-; VI-NEXT:    s_cselect_b32 s0, s6, s7
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_lshl_b32 s4, s2, 7
+; VI-NEXT:    s_or_b32 s4, s3, s4
+; VI-NEXT:    s_cmp_eq_u32 s4, 0
+; VI-NEXT:    s_cselect_b32 s2, s2, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -774,16 +774,16 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ;
 ; GFX11-LABEL: orxor2or1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s0, s6, 7
+; GFX11-NEXT:    s_lshl_b32 s4, s2, 7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s0, s7, s0
-; GFX11-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX11-NEXT:    s_cselect_b32 s0, s6, s7
+; GFX11-NEXT:    s_or_b32 s4, s3, s4
+; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX11-NEXT:    s_cselect_b32 s2, s2, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index a5ea1ee92a048..e8377763e4be2 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -86,13 +86,13 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
 ; GFX11-LABEL: fshr_i32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x34
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_alignbit_b32 v0, s6, s7, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v0, s2, s3, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -118,12 +118,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; VI-LABEL: fshr_i32_imm:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    v_alignbit_b32 v2, s6, v0, 7
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 7
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -160,11 +160,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX11-LABEL: fshr_i32_imm:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, s6, s7, 7
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 7
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 4d585cf56b9a1..d4398e5367c7f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -4140,15 +4140,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32
 ;
 ; VI-LABEL: atomic_cmpxchg_i32_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_endpgm
@@ -4391,15 +4391,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3
 ;
 ; VI-LABEL: atomic_cmpxchg_i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 3050da034d236..1fa7c52a68802 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -4648,24 +4648,24 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_max_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s1, s7, 31
-; VI-NEXT:    s_mov_b32 s0, s7
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s2, s4, s0
-; VI-NEXT:    s_addc_u32 s3, s5, s1
-; VI-NEXT:    s_load_dword s4, s[2:3], 0x10
-; VI-NEXT:    s_add_u32 s2, s2, 16
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_ashr_i32 s5, s3, 31
+; VI-NEXT:    s_mov_b32 s4, s3
+; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; VI-NEXT:    s_add_u32 s4, s0, s4
+; VI-NEXT:    s_addc_u32 s5, s1, s5
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
+; VI-NEXT:    s_add_u32 s4, s4, 16
+; VI-NEXT:    s_addc_u32 s5, s5, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_mov_b64 s[0:1], 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; VI-NEXT:    v_max_i32_e32 v2, s6, v3
+; VI-NEXT:    v_max_i32_e32 v2, s2, v3
 ; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -4861,22 +4861,22 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; VI-LABEL: atomic_max_i32_addr64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s1, s7, 31
-; VI-NEXT:    s_mov_b32 s0, s7
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s2, s4, s0
-; VI-NEXT:    s_addc_u32 s3, s5, s1
-; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_ashr_i32 s5, s3, 31
+; VI-NEXT:    s_mov_b32 s4, s3
+; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; VI-NEXT:    s_add_u32 s4, s0, s4
+; VI-NEXT:    s_addc_u32 s5, s1, s5
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_mov_b64 s[0:1], 0
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:  .LBB93_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; VI-NEXT:    v_max_i32_e32 v2, s6, v3
+; VI-NEXT:    v_max_i32_e32 v2, s2, v3
 ; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -5901,24 +5901,24 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: atomic_umax_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s1, s7, 31
-; VI-NEXT:    s_mov_b32 s0, s7
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s2, s4, s0
-; VI-NEXT:    s_addc_u32 s3, s5, s1
-; VI-NEXT:    s_load_dword s4, s[2:3], 0x10
-; VI-NEXT:    s_add_u32 s2, s2, 16
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_ashr_i32 s5, s3, 31
+; VI-NEXT:    s_mov_b32 s4, s3
+; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; VI-NEXT:    s_add_u32 s4, s0, s4
+; VI-NEXT:    s_addc_u32 s5, s1, s5
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
+; VI-NEXT:    s_add_u32 s4, s4, 16
+; VI-NEXT:    s_addc_u32 s5, s5, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_mov_b64 s[0:1], 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:  .LBB105_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; VI-NEXT:    v_max_u32_e32 v2, s6, v3
+; VI-NEXT:    v_max_u32_e32 v2, s2, v3
 ; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -7892,24 +7892,24 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; VI-LABEL: atomic_min_i32_addr64_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s1, s7, 31
-; VI-NEXT:    s_mov_b32 s0, s7
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; VI-NEXT:    s_add_u32 s2, s4, s0
-; VI-NEXT:    s_addc_u32 s3, s5, s1
-; VI-NEXT:    s_load_dword s4, s[2:3], 0x10
-; VI-NEXT:    s_add_u32 s2, s2, 16
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_ashr_i32 s5, s3, 31
+; VI-NEXT:    s_mov_b32 s4, s3
+; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; VI-NEXT:    s_add_u32 s4, s0, s4
+; VI-NEXT:    s_addc_u32 s5, s1, s5
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
+; VI-NEXT:    s_add_u32 s4, s4, 16
+; VI-NEXT:    s_addc_u32 s5, s5, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_mov_b64 s[0:1], 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; VI-NEXT:    v_min_i32_e32 v2, s6, v3
+; VI-NEXT:    v_min_i32_e32 v2, s2, v3
 ; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index adb803a22815c..e2d55990473c0 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -100,13 +100,13 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
 ;
 ; GFX11-LABEL: load_v3f16_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT:    v_mov_b32_e32 v2, s6
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5] offset:4
-; GFX11-NEXT:    global_store_b32 v0, v2, s[4:5]
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1] offset:4
+; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -130,11 +130,11 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
 ;
 ; GFX11-LABEL: load_v4f16_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -349,14 +349,14 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ;
 ; GFX11-LABEL: extload_v3f16_to_v3f32_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s6, 16
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s6
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s7
-; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[4:5]
+; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s4
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -398,16 +398,16 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ;
 ; GFX11-LABEL: extload_v4f16_to_v4f32_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s7, 16
-; GFX11-NEXT:    s_lshr_b32 s1, s6, 16
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s6
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s7
-; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
+; GFX11-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX11-NEXT:    s_lshr_b32 s5, s2, 16
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -649,12 +649,12 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ;
 ; GFX11-LABEL: extload_v3f16_to_v3f64_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s0, s6, 16
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s7
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, s6
+; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s4
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v0
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v1
@@ -662,8 +662,8 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v6
 ; GFX11-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[4:5] offset:16
-; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[4:5]
+; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
+; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -725,14 +725,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ;
 ; GFX11-LABEL: extload_v4f16_to_v4f64_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s1, s7, 16
-; GFX11-NEXT:    s_lshr_b32 s0, s6, 16
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s7
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s6
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, s0
+; GFX11-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s5
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, s4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v2
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v3
@@ -741,8 +741,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
-; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
+; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2693,11 +2693,11 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
 ;
 ; GFX11-LABEL: fadd_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_add_f16 v1, s6, s7
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_pk_add_f16 v1, s2, s3
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 18d5c057d156a..f736ca7cd625a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -85,19 +85,19 @@ entry:
 define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) {
 ; GCN-LABEL: float2_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s2, 1
-; GCN-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -289,20 +289,20 @@ entry:
 define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s0, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
+; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s2, s2, 4
-; GCN-NEXT:    s_lshl_b64 s[2:3], 0xffff, s2
-; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], s[2:3]
-; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_lshl_b32 s6, s6, 4
+; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
+; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -314,16 +314,16 @@ entry:
 define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half2_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s0, s7, 4
-; GCN-NEXT:    s_lshl_b32 s0, 0xffff, s0
-; GCN-NEXT:    s_andn2_b32 s1, s6, s0
-; GCN-NEXT:    s_and_b32 s0, s0, 0x3c003c00
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_lshl_b32 s3, s3, 4
+; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
+; GCN-NEXT:    s_andn2_b32 s2, s2, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 0x3c003c00
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -397,16 +397,16 @@ entry:
 define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) {
 ; GCN-LABEL: short2_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s0, s7, 4
-; GCN-NEXT:    s_lshl_b32 s0, 0xffff, s0
-; GCN-NEXT:    s_andn2_b32 s1, s6, s0
-; GCN-NEXT:    s_and_b32 s0, s0, 0x10001
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_lshl_b32 s3, s3, 4
+; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
+; GCN-NEXT:    s_andn2_b32 s2, s2, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 0x10001
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -418,20 +418,20 @@ entry:
 define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
 ; GCN-LABEL: short4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s0, 0x10001
-; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 0x10001
+; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s2, s2, 4
-; GCN-NEXT:    s_lshl_b64 s[2:3], 0xffff, s2
-; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], s[2:3]
-; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_lshl_b32 s6, s6, 4
+; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
+; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -443,19 +443,19 @@ entry:
 define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
 ; GCN-LABEL: byte8_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s0, s2, 3
-; GCN-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
-; GCN-NEXT:    s_and_b32 s3, s1, 0x1010101
-; GCN-NEXT:    s_and_b32 s2, s0, 0x1010101
-; GCN-NEXT:    s_andn2_b64 s[0:1], s[6:7], s[0:1]
-; GCN-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_lshl_b32 s4, s4, 3
+; GCN-NEXT:    s_lshl_b64 s[4:5], 0xff, s4
+; GCN-NEXT:    s_and_b32 s7, s5, 0x1010101
+; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
+; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -962,34 +962,34 @@ entry:
 define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) {
 ; GCN-LABEL: bit4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN-NEXT:    s_mov_b32 s10, -1
-; GCN-NEXT:    s_mov_b32 s11, 0xe80000
-; GCN-NEXT:    s_add_u32 s8, s8, s3
-; GCN-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GCN-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xe80000
+; GCN-NEXT:    s_add_u32 s4, s4, s3
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_addc_u32 s5, s5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s0, s7, 3
-; GCN-NEXT:    v_mov_b32_e32 v1, s6
-; GCN-NEXT:    v_lshrrev_b16_e64 v2, 1, s6
-; GCN-NEXT:    v_lshrrev_b16_e64 v3, 2, s6
-; GCN-NEXT:    v_lshrrev_b16_e64 v4, 3, s6
-; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    v_lshrrev_b16_e64 v2, 1, s2
+; GCN-NEXT:    v_lshrrev_b16_e64 v3, 2, s2
+; GCN-NEXT:    v_lshrrev_b16_e64 v4, 3, s2
+; GCN-NEXT:    v_or_b32_e32 v0, s3, v0
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
 ; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
-; GCN-NEXT:    buffer_store_byte v1, off, s[8:11], 0
-; GCN-NEXT:    buffer_store_byte v4, off, s[8:11], 0 offset:3
-; GCN-NEXT:    buffer_store_byte v3, off, s[8:11], 0 offset:2
-; GCN-NEXT:    buffer_store_byte v2, off, s[8:11], 0 offset:1
+; GCN-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_byte v4, off, s[4:7], 0 offset:3
+; GCN-NEXT:    buffer_store_byte v3, off, s[4:7], 0 offset:2
+; GCN-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
-; GCN-NEXT:    buffer_store_byte v1, v0, s[8:11], 0 offen
-; GCN-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; GCN-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
-; GCN-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
-; GCN-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:3
+; GCN-NEXT:    buffer_store_byte v1, v0, s[4:7], 0 offen
+; GCN-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:1
+; GCN-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0 offset:2
+; GCN-NEXT:    buffer_load_ubyte v3, off, s[4:7], 0 offset:3
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
@@ -1004,8 +1004,8 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GCN-NEXT:    v_and_b32_e32 v2, 15, v0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_byte v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 13134601cc33d..d5265e364a17e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -556,73 +556,73 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX11-LABEL: udiv_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX11-NEXT:    s_sub_i32 s1, 0, s7
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX11-NEXT:    s_sub_i32 s5, 0, s3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_mul_i32 s1, s1, s0
+; GFX11-NEXT:    s_mul_i32 s5, s5, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mul_hi_u32 s1, s0, s1
-; GFX11-NEXT:    s_add_i32 s0, s0, s1
+; GFX11-NEXT:    s_mul_hi_u32 s5, s4, s5
+; GFX11-NEXT:    s_add_i32 s4, s4, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mul_hi_u32 s0, s6, s0
-; GFX11-NEXT:    s_mul_i32 s1, s0, s7
-; GFX11-NEXT:    s_add_i32 s2, s0, 1
-; GFX11-NEXT:    s_sub_i32 s1, s6, s1
+; GFX11-NEXT:    s_mul_hi_u32 s4, s2, s4
+; GFX11-NEXT:    s_mul_i32 s5, s4, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_sub_i32 s3, s1, s7
-; GFX11-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX11-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX11-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX11-NEXT:    s_add_i32 s2, s0, 1
-; GFX11-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX11-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX11-NEXT:    s_sub_i32 s2, s2, s5
+; GFX11-NEXT:    s_add_i32 s5, s4, 1
+; GFX11-NEXT:    s_sub_i32 s6, s2, s3
+; GFX11-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX11-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX11-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX11-NEXT:    s_add_i32 s5, s4, 1
+; GFX11-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX11-NEXT:    s_cselect_b32 s2, s5, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: udiv_i32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cvt_f32_u32 s0, s7
-; GFX12-NEXT:    s_sub_co_i32 s1, 0, s7
+; GFX12-NEXT:    s_cvt_f32_u32 s4, s3
+; GFX12-NEXT:    s_sub_co_i32 s5, 0, s3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX12-NEXT:    v_rcp_iflag_f32_e32 v0, s0
-; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-NEXT:    v_rcp_iflag_f32_e32 v0, s4
+; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
-; GFX12-NEXT:    s_mul_f32 s0, s0, 0x4f7ffffe
-; GFX12-NEXT:    s_cvt_u32_f32 s0, s0
+; GFX12-NEXT:    s_mul_f32 s4, s4, 0x4f7ffffe
+; GFX12-NEXT:    s_cvt_u32_f32 s4, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mul_i32 s1, s1, s0
-; GFX12-NEXT:    s_mul_hi_u32 s1, s0, s1
+; GFX12-NEXT:    s_mul_i32 s5, s5, s4
+; GFX12-NEXT:    s_mul_hi_u32 s5, s4, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_add_co_i32 s0, s0, s1
-; GFX12-NEXT:    s_mul_hi_u32 s0, s6, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mul_i32 s1, s0, s7
-; GFX12-NEXT:    s_add_co_i32 s2, s0, 1
-; GFX12-NEXT:    s_sub_co_i32 s1, s6, s1
-; GFX12-NEXT:    s_sub_co_i32 s3, s1, s7
-; GFX12-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX12-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX12-NEXT:    s_add_co_i32 s2, s0, 1
-; GFX12-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX12-NEXT:    s_add_co_i32 s4, s4, s5
+; GFX12-NEXT:    s_mul_hi_u32 s4, s2, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mul_i32 s5, s4, s3
+; GFX12-NEXT:    s_sub_co_i32 s2, s2, s5
+; GFX12-NEXT:    s_add_co_i32 s5, s4, 1
+; GFX12-NEXT:    s_sub_co_i32 s6, s2, s3
+; GFX12-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX12-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX12-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX12-NEXT:    s_add_co_i32 s5, s4, 1
+; GFX12-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX12-NEXT:    s_cselect_b32 s2, s5, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    s_endpgm
   %r = udiv i32 %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a344128d94fcf..69f181fcede30 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -731,12 +731,12 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
 ;
 ; VI-LABEL: v2i32_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -794,12 +794,12 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
 ;
 ; VI-LABEL: v2f32_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -998,16 +998,16 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
 ;
 ; VI-LABEL: v3i16_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s4, 4
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s7
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v5, s6
+; VI-NEXT:    s_add_u32 s4, s0, 4
+; VI-NEXT:    s_addc_u32 s5, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v4, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v5, s2
 ; VI-NEXT:    flat_store_short v[2:3], v4
 ; VI-NEXT:    flat_store_dword v[0:1], v5
 ; VI-NEXT:    s_endpgm
@@ -1332,12 +1332,12 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
 ;
 ; VI-LABEL: v4i16_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1545,16 +1545,16 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i
 ;
 ; VI-LABEL: v5i8_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s4, 4
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s7
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v5, s6
+; VI-NEXT:    s_add_u32 s4, s0, 4
+; VI-NEXT:    s_addc_u32 s5, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v4, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v5, s2
 ; VI-NEXT:    flat_store_byte v[2:3], v4
 ; VI-NEXT:    flat_store_dword v[0:1], v5
 ; VI-NEXT:    s_endpgm
@@ -2397,12 +2397,12 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
 ;
 ; VI-LABEL: v8i8_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -4539,19 +4539,19 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou
 ;
 ; VI-LABEL: i65_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s1, s2, 1
-; VI-NEXT:    s_add_u32 s0, s4, 8
-; VI-NEXT:    v_mov_b32_e32 v6, s1
-; VI-NEXT:    s_addc_u32 s1, s5, 0
+; VI-NEXT:    s_and_b32 s4, s4, 1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_add_u32 s0, s0, 8
+; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v6, s4
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_byte v[4:5], v6
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -5565,18 +5565,18 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
 ;
 ; VI-LABEL: array_3xi32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_short v[0:1], v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dword v[0:1], v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 01a1ab41c5cac..45a1afbf11992 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -21,12 +21,12 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x
 ;
 ; VI-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s6, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -51,11 +51,11 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x
 ;
 ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, s6, s7
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index a737c5e7dd265..5d20a848bd6a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -15,12 +15,12 @@ declare half @llvm.fabs.f16(half) #0
 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) {
 ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; SDAG-GFX11:       ; %bb.0:
-; SDAG-GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; SDAG-GFX11-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; SDAG-GFX11-NEXT:    s_nop 0
 ; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
@@ -37,13 +37,13 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; GISEL-GFX11-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GISEL-GFX11-NEXT:    s_nop 0
 ; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
@@ -66,12 +66,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) {
 ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; SDAG-GFX11:       ; %bb.0:
-; SDAG-GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
+; SDAG-GFX11-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; SDAG-GFX11-NEXT:    s_nop 0
 ; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
@@ -88,13 +88,13 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
+; GISEL-GFX11-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GISEL-GFX11-NEXT:    s_nop 0
 ; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index 7d41cf1c5bcb8..674fec1b865a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -16,14 +16,14 @@ declare half @llvm.fabs.f16(half) #0
 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |s7|
+; GFX11-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -42,27 +42,27 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s7
-; VI-SDAG-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |v0|
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s5
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s3
+; VI-SDAG-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |v0|
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s7
-; VI-GISEL-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |v0|
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; VI-GISEL-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |v0|
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
@@ -74,14 +74,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_f32_e64 s[0:1], |s6|, |s7|
+; GFX11-NEXT:    v_cmp_eq_f32_e64 s[2:3], |s2|, |s3|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -100,27 +100,27 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s7
-; VI-SDAG-NEXT:    v_cmp_eq_f32_e64 s[0:1], |s6|, |v0|
-; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s5
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s3
+; VI-SDAG-NEXT:    v_cmp_eq_f32_e64 s[2:3], |s2|, |v0|
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s7
-; VI-GISEL-NEXT:    v_cmp_eq_f32_e64 s[0:1], |s6|, |v0|
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; VI-GISEL-NEXT:    v_cmp_eq_f32_e64 s[2:3], |s2|, |v0|
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index ae61b58e95ac7..8fe85e49a4207 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -1743,16 +1743,16 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
 define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; GFX11-LABEL: v_icmp_i1_ne0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_gt_u32 s6, 1
-; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-NEXT:    s_cmp_gt_u32 s7, 2
-; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX11-NEXT:    s_cmp_gt_u32 s2, 1
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cmp_gt_u32 s3, 2
+; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s0, s0, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index 54931ac345130..a650f999835c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -1952,35 +1952,35 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
 define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; GFX11-LABEL: v_icmp_i1_ne0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_gt_u32 s6, 1
-; GFX11-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX11-NEXT:    s_cmp_gt_u32 s7, 2
+; GFX11-NEXT:    s_cmp_gt_u32 s2, 1
+; GFX11-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX11-NEXT:    s_cmp_gt_u32 s3, 2
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_icmp_i1_ne0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_gt_u32 s6, 1
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    s_cmp_gt_u32 s7, 2
+; VI-NEXT:    s_cmp_gt_u32 s2, 1
+; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT:    s_cmp_gt_u32 s3, 2
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 3a77b3bc9cb95..74d936f8093dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -189,7 +189,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ;
 ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x41000000
 ; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40e00000
@@ -199,33 +199,33 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_add_co_u32 v0, s0, s4, v2
-; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s0
-; GFX1030-NEXT:    v_add_co_u32 v2, s0, s6, v2
-; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s0
+; GFX1030-NEXT:    v_add_co_u32 v0, s0, s0, v2
+; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX1030-NEXT:    v_add_co_u32 v2, s0, s2, v2
+; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s0
 ; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v2, 2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 2.0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, s0, s4, v2
+; GFX11-NEXT:    v_add_co_u32 v0, s0, s0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s0
-; GFX11-NEXT:    v_add_co_u32 v2, s0, s6, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, s2, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s0
 ; GFX11-NEXT:    flat_load_b32 v9, v[0:1]
 ; GFX11-NEXT:    flat_load_b32 v10, v[2:3]
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40c00000
@@ -233,7 +233,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41000000
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0x40400000
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[8:11]
+; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[4:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
 ; GFX11-NEXT:    s_endpgm
@@ -283,46 +283,46 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
 ;
 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x44004200
 ; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x46004500
 ; GFX1030-NEXT:    v_mov_b32_e32 v7, 0x48004700
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_add_co_u32 v0, s0, s4, v2
-; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s0
-; GFX1030-NEXT:    v_add_co_u32 v2, s0, s6, v2
-; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s0
+; GFX1030-NEXT:    v_add_co_u32 v0, s0, s0, v2
+; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX1030-NEXT:    v_add_co_u32 v2, s0, s2, v2
+; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s0
 ; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
 ; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v0, s0, s4, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s0
-; GFX11-NEXT:    v_add_co_u32 v2, s0, s6, v2
+; GFX11-NEXT:    v_add_co_u32 v0, s0, s0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, s2, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s0
 ; GFX11-NEXT:    flat_load_b32 v6, v[0:1]
 ; GFX11-NEXT:    flat_load_b32 v7, v[2:3]
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x47004400
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0x48004500
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[8:11] a16
+; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 634159aec9db5..cd92529b77165 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -85,12 +85,12 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0
 ;
 ; GFX12-LABEL: v_permlane16_b32_vii:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, 1, 2
-; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -131,13 +131,13 @@ define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0
 ;
 ; GFX12-LABEL: v_permlane16_b32_vll:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-NEXT:    s_movk_i32 s0, 0x1234
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_permlane16_b32 v0, v0, s0, 0xc1d1
-; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT:    s_movk_i32 s2, 0x1234
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -203,35 +203,35 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX12-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s6
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
-; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s3, s2
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -269,52 +269,54 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s7
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s7
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
+; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s7
-; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s7
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -339,58 +341,60 @@ define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s7, s0
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s3, s2
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s7, s0
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
+; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s7, s0
-; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s3, s2
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s7, s0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -609,12 +613,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vii:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
-; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -655,13 +659,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vll:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-NEXT:    s_movk_i32 s0, 0x1234
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s0, 0xc1d1
-; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT:    s_movk_i32 s2, 0x1234
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -727,35 +731,35 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX12-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s6
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
-; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s3, s2
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -793,52 +797,54 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s7
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s7
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
+; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s7
-; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s7
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -863,58 +869,60 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s7, s0
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s3, s2
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-SDAG-NEXT:    s_nop 0
 ; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
+; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s7, s0
-; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s3, s2
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
index 77a975f7abe0f..973678291e263 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
@@ -10,26 +10,26 @@ declare i32 @llvm.amdgcn.workitem.id.y()
 define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -41,26 +41,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr
 define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %src0) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vi:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v1, v0
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s6
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -72,26 +72,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr
 define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %src0) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vl:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc1d1
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v1, v0
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s6
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -103,25 +103,25 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr
 define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v1, v0
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -134,26 +134,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -165,26 +165,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_bc:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -196,26 +196,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi_bc:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -227,26 +227,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i
 define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -258,26 +258,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %src0) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vi:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v1, v0
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s6
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -289,26 +289,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %src0) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vl:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc1d1
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v1, v0
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s6
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -320,25 +320,25 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v1, v0
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -351,26 +351,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %
 define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -382,26 +382,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_bc:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -413,26 +413,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi_bc:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
-; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index bb3418ca5de99..47c021769aa56 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -91,58 +91,58 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ;
 ; VARIANT4-LABEL: test_barrier:
 ; VARIANT4:       ; %bb.0: ; %entry
-; VARIANT4-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; VARIANT4-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; VARIANT4-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; VARIANT4-NEXT:    s_wait_kmcnt 0x0
-; VARIANT4-NEXT:    v_xad_u32 v1, v0, -1, s6
-; VARIANT4-NEXT:    global_store_b32 v3, v0, s[4:5]
+; VARIANT4-NEXT:    v_xad_u32 v1, v0, -1, s2
+; VARIANT4-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; VARIANT4-NEXT:    s_wait_storecnt 0x0
 ; VARIANT4-NEXT:    s_barrier_signal -1
 ; VARIANT4-NEXT:    s_barrier_wait -1
 ; VARIANT4-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; VARIANT4-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; VARIANT4-NEXT:    v_lshlrev_b64_e32 v[1:2], 2, v[1:2]
-; VARIANT4-NEXT:    v_add_co_u32 v1, vcc_lo, s4, v1
+; VARIANT4-NEXT:    v_add_co_u32 v1, vcc_lo, s0, v1
 ; VARIANT4-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; VARIANT4-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s5, v2, vcc_lo
+; VARIANT4-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo
 ; VARIANT4-NEXT:    global_load_b32 v0, v[1:2], off
 ; VARIANT4-NEXT:    s_wait_loadcnt 0x0
-; VARIANT4-NEXT:    global_store_b32 v3, v0, s[4:5]
+; VARIANT4-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; VARIANT4-NEXT:    s_nop 0
 ; VARIANT4-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; VARIANT4-NEXT:    s_endpgm
 ;
 ; VARIANT5-LABEL: test_barrier:
 ; VARIANT5:       ; %bb.0: ; %entry
-; VARIANT5-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; VARIANT5-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; VARIANT5-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; VARIANT5-NEXT:    s_wait_kmcnt 0x0
-; VARIANT5-NEXT:    v_xad_u32 v1, v0, -1, s6
-; VARIANT5-NEXT:    global_store_b32 v3, v0, s[4:5]
+; VARIANT5-NEXT:    v_xad_u32 v1, v0, -1, s2
+; VARIANT5-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; VARIANT5-NEXT:    s_barrier_signal -1
 ; VARIANT5-NEXT:    s_barrier_wait -1
 ; VARIANT5-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; VARIANT5-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; VARIANT5-NEXT:    v_lshlrev_b64_e32 v[1:2], 2, v[1:2]
-; VARIANT5-NEXT:    v_add_co_u32 v1, vcc_lo, s4, v1
+; VARIANT5-NEXT:    v_add_co_u32 v1, vcc_lo, s0, v1
 ; VARIANT5-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; VARIANT5-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s5, v2, vcc_lo
+; VARIANT5-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo
 ; VARIANT5-NEXT:    global_load_b32 v0, v[1:2], off
 ; VARIANT5-NEXT:    s_wait_loadcnt 0x0
-; VARIANT5-NEXT:    global_store_b32 v3, v0, s[4:5]
+; VARIANT5-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; VARIANT5-NEXT:    s_nop 0
 ; VARIANT5-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; VARIANT5-NEXT:    s_endpgm
 ;
 ; VARIANT6-LABEL: test_barrier:
 ; VARIANT6:       ; %bb.0: ; %entry
-; VARIANT6-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; VARIANT6-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; VARIANT6-NEXT:    v_lshlrev_b32_e32 v5, 2, v0
 ; VARIANT6-NEXT:    s_wait_kmcnt 0x0
-; VARIANT6-NEXT:    s_sub_co_i32 s0, s6, 1
-; VARIANT6-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
-; VARIANT6-NEXT:    v_sub_nc_u32_e32 v1, s0, v0
-; VARIANT6-NEXT:    global_store_b32 v5, v0, s[4:5]
+; VARIANT6-NEXT:    s_sub_co_i32 s2, s2, 1
+; VARIANT6-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
+; VARIANT6-NEXT:    v_sub_nc_u32_e32 v1, s2, v0
+; VARIANT6-NEXT:    global_store_b32 v5, v0, s[0:1]
 ; VARIANT6-NEXT:    s_wait_storecnt 0x0
 ; VARIANT6-NEXT:    s_barrier_signal -1
 ; VARIANT6-NEXT:    s_barrier_wait -1
@@ -154,7 +154,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT6-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
 ; VARIANT6-NEXT:    global_load_b32 v0, v[1:2], off
 ; VARIANT6-NEXT:    s_wait_loadcnt 0x0
-; VARIANT6-NEXT:    global_store_b32 v5, v0, s[4:5]
+; VARIANT6-NEXT:    global_store_b32 v5, v0, s[0:1]
 ; VARIANT6-NEXT:    s_nop 0
 ; VARIANT6-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; VARIANT6-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 3eb226193051d..38a34ec6daf73 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -518,33 +518,33 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
 define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
 ; GCN-LABEL: test1_s_barrier_init:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GCN-NEXT:    v_mul_u32_u24_e32 v1, v0, v0
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_lshl_b32 s0, s6, 16
-; GCN-NEXT:    global_store_b32 v3, v2, s[4:5]
-; GCN-NEXT:    s_mov_b32 m0, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 16
+; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_mov_b32 m0, s2
 ; GCN-NEXT:    s_barrier_init -1
-; GCN-NEXT:    global_store_b32 v3, v0, s[4:5]
+; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test1_s_barrier_init:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s6
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[4:5]
+; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s2
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_init -1
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[4:5]
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
@@ -562,33 +562,33 @@ entry:
 define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
 ; GCN-LABEL: test2_s_barrier_init:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GCN-NEXT:    v_mul_u32_u24_e32 v1, v0, v0
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_lshl_b32 s0, s6, 16
-; GCN-NEXT:    global_store_b32 v3, v2, s[4:5]
-; GCN-NEXT:    s_mov_b32 m0, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 16
+; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_mov_b32 m0, s2
 ; GCN-NEXT:    s_barrier_init 1
-; GCN-NEXT:    global_store_b32 v3, v0, s[4:5]
+; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test2_s_barrier_init:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s6
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[4:5]
+; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s2
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_init 1
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[4:5]
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
@@ -606,33 +606,33 @@ entry:
 define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
 ; GCN-LABEL: test3_s_barrier_init:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GCN-NEXT:    v_mul_u32_u24_e32 v1, v0, v0
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_lshl_b32 s0, s6, 16
-; GCN-NEXT:    global_store_b32 v3, v2, s[4:5]
-; GCN-NEXT:    s_mov_b32 m0, s0
+; GCN-NEXT:    s_lshl_b32 s2, s2, 16
+; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_mov_b32 m0, s2
 ; GCN-NEXT:    s_barrier_init 0
-; GCN-NEXT:    global_store_b32 v3, v0, s[4:5]
+; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test3_s_barrier_init:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s6
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[4:5]
+; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s2
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_init 0
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[4:5]
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
@@ -650,35 +650,35 @@ entry:
 define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 {
 ; GCN-LABEL: test4_s_barrier_init:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mul_u32_u24_e32 v1, v0, v0
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_lshl_b32 s0, s7, 16
-; GCN-NEXT:    global_store_b32 v3, v2, s[4:5]
-; GCN-NEXT:    s_or_b32 s0, s6, s0
-; GCN-NEXT:    s_mov_b32 m0, s0
+; GCN-NEXT:    s_lshl_b32 s3, s3, 16
+; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_mov_b32 m0, s2
 ; GCN-NEXT:    s_barrier_init m0
-; GCN-NEXT:    global_store_b32 v3, v0, s[4:5]
+; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test4_s_barrier_init:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_lshl_b32 s0, 16, s7
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[4:5]
-; GLOBAL-ISEL-NEXT:    s_or_b32 m0, s6, s0
+; GLOBAL-ISEL-NEXT:    s_lshl_b32 s3, 16, s3
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_or_b32 m0, s2, s3
 ; GLOBAL-ISEL-NEXT:    s_barrier_init m0
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[4:5]
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
@@ -852,33 +852,33 @@ entry:
 define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 {
 ; GCN-LABEL: test4_s_barrier_join_m0:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GCN-NEXT:    v_mul_u32_u24_e32 v2, v0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_mov_b32 m0, s6
-; GCN-NEXT:    global_store_b32 v3, v1, s[4:5]
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    global_store_b32 v3, v1, s[0:1]
 ; GCN-NEXT:    s_barrier_join m0
-; GCN-NEXT:    global_store_b32 v3, v0, s[4:5]
+; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s6
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[4:5]
+; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s2
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_join m0
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[4:5]
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
@@ -1098,33 +1098,33 @@ entry:
 define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 {
 ; GCN-LABEL: test4_s_wakeup_barrier_m0:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GCN-NEXT:    v_mul_u32_u24_e32 v2, v0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_mov_b32 m0, s6
-; GCN-NEXT:    global_store_b32 v3, v1, s[4:5]
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    global_store_b32 v3, v1, s[0:1]
 ; GCN-NEXT:    s_wakeup_barrier m0
-; GCN-NEXT:    global_store_b32 v3, v0, s[4:5]
+; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s6
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[4:5]
+; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s2
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_wakeup_barrier m0
-; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[4:5]
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
@@ -1278,32 +1278,32 @@ entry:
 define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 {
 ; GCN-LABEL: test4_s_get_barrier_state_m0:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    s_mov_b32 m0, s6
-; GCN-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GCN-NEXT:    s_get_barrier_state s0, m0
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT:    s_get_barrier_state s2, m0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
-; GCN-NEXT:    v_mov_b32_e32 v1, s0
-; GCN-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 ;
 ; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0:
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s6
-; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GLOBAL-ISEL-NEXT:    s_get_barrier_state s0, m0
+; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s2
+; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_get_barrier_state s2, m0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
-; GLOBAL-ISEL-NEXT:    v_mov_b32_e32 v1, s0
-; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GLOBAL-ISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
 ; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GLOBAL-ISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
index 673e9bf08eb2b..c2e74eb05d164 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
@@ -5,12 +5,12 @@
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v40, 5, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT:    v_add_nc_u32_e32 v32, s2, v40
-; GCN-NEXT:    v_dual_mov_b32 v81, s3 :: v_dual_add_nc_u32 v80, s3, v40
+; GCN-NEXT:    v_add_nc_u32_e32 v32, s0, v40
+; GCN-NEXT:    v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40
 ; GCN-NEXT:    ds_load_b128 v[4:7], v32 offset:16
 ; GCN-NEXT:    ds_load_b128 v[12:15], v32 offset:2064
 ; GCN-NEXT:    ds_load_b128 v[20:23], v32 offset:6160
@@ -72,12 +72,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v40, 5, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v32, s2, v40
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v81, s3 :: v_dual_add_nc_u32 v80, s3, v40
+; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v32, s0, v40
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v32 offset:16
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[12:15], v32 offset:2064
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[20:23], v32 offset:6160
@@ -175,12 +175,12 @@ entry:
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 5, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT:    v_add_nc_u32_e32 v17, s2, v16
-; GCN-NEXT:    v_add_nc_u32_e32 v16, s3, v16
+; GCN-NEXT:    v_add_nc_u32_e32 v17, s0, v16
+; GCN-NEXT:    v_add_nc_u32_e32 v16, s1, v16
 ; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:16
 ; GCN-NEXT:    ds_load_b128 v[0:3], v17
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
@@ -196,7 +196,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
 ; GCN-NEXT:    ds_store_b128 v16, v[8:11]
 ; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:2064
 ; GCN-NEXT:    ds_load_b128 v[0:3], v17 offset:2048
-; GCN-NEXT:    v_mov_b32_e32 v16, s3
+; GCN-NEXT:    v_mov_b32_e32 v16, s1
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -256,12 +256,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v16, 5, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v17, s2, v16
-; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v16, s3, v16
+; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v17, s0, v16
+; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v16, s1, v16
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:16
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
@@ -277,7 +277,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[8:11]
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:2064
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17 offset:2048
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v16, s3
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v16, s1
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
index bc29441edc247..fdcb1773d0a3f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
@@ -7,13 +7,13 @@ declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16(
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v28, 4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v48, 0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GCN-NEXT:    v_add_nc_u32_e32 v0, s2, v28
-; GCN-NEXT:    v_dual_mov_b32 v50, s3 :: v_dual_add_nc_u32 v49, s3, v28
+; GCN-NEXT:    v_add_nc_u32_e32 v0, s0, v28
+; GCN-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
 ; GCN-NEXT:    ds_load_b128 v[8:11], v0
 ; GCN-NEXT:    ds_load_b128 v[12:15], v0 offset:512
 ; GCN-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
@@ -58,13 +58,13 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v28, 4, v0
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v48, 0
 ; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
 ; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v0, s2, v28
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v50, s3 :: v_dual_add_nc_u32 v49, s3, v28
+; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v0, s0, v28
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v0
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[12:15], v0 offset:512
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
@@ -147,11 +147,11 @@ entry:
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v18, 0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    v_lshl_add_u32 v17, v0, 5, s2
-; GCN-NEXT:    v_lshl_add_u32 v0, v0, 4, s3
+; GCN-NEXT:    v_lshl_add_u32 v17, v0, 5, s0
+; GCN-NEXT:    v_lshl_add_u32 v0, v0, 4, s1
 ; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:1024
 ; GCN-NEXT:    ds_load_b128 v[1:4], v17
 ; GCN-NEXT:    ds_load_b128 v[5:8], v17 offset:16
@@ -165,7 +165,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    ds_store_b128 v0, v[13:16]
 ; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:2560
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
 ; GCN-NEXT:    s_wait_dscnt 0x0
@@ -210,11 +210,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v18, 0
 ; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
-; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v17, v0, 5, s2
-; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v0, v0, 4, s3
+; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v17, v0, 5, s0
+; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v0, v0, 4, s1
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:1024
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[1:4], v17
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[5:8], v17 offset:16
@@ -228,7 +228,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16]
 ; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:2560
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s3
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 8f8994e78fd06..fc33206845a71 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -211,21 +211,21 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half>
 define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
 ; GCN-LABEL: set_inactive_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    s_mov_b32 s8, 1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_mov_b32 s9, s8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
   store <2 x i32> %tmp, ptr addrspace(1) %out
@@ -235,21 +235,21 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
 define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GCN-LABEL: set_inactive_v2f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    s_mov_b32 s8, 1.0
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_mov_b32 s9, s8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
   store <2 x float> %tmp, ptr addrspace(1) %out
@@ -279,21 +279,21 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa
 define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
 ; GCN-LABEL: set_inactive_v4i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    s_mov_b32 s8, 0x10001
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_mov_b32 s9, s8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
   store <4 x i16> %tmp, ptr addrspace(1) %out
@@ -303,21 +303,21 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
 define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; GCN-LABEL: set_inactive_v4f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    s_mov_b32 s8, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_mov_b32 s9, s8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
   store <4 x half> %tmp, ptr addrspace(1) %out
@@ -327,21 +327,21 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
 define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
 ; GCN-LABEL: set_inactive_v4bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    s_mov_b32 s8, 0x3f803f80
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_mov_b32 s9, s8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
   store <4 x bfloat> %tmp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index 87c5f5bd7b784..0755dcddd8f46 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -18,15 +18,15 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0,
 ;
 ; VI-LABEL: bfe_u32_arg_arg_arg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    v_bfe_u32 v0, v0, s7, s7
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_bfe_u32 v0, v0, s3, s3
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1)
   store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -50,16 +50,16 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0,
 ;
 ; VI-LABEL: bfe_u32_arg_arg_imm:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x7b
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    v_bfe_u32 v0, s6, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_bfe_u32 v0, s2, v0, v1
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123)
   store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -83,16 +83,16 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0,
 ;
 ; VI-LABEL: bfe_u32_arg_imm_arg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    v_bfe_u32 v0, s6, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_bfe_u32 v0, s2, v0, v1
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2)
   store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -117,17 +117,17 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1,
 ;
 ; VI-LABEL: bfe_u32_imm_arg_arg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_movk_i32 s8, 0x7b
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    v_bfe_u32 v0, s8, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2)
   store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1625,16 +1625,16 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0
 ;
 ; VI-LABEL: v_lshr_and:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_lshr_b32 s4, s6, s7
-; VI-NEXT:    s_and_b32 s4, s4, 7
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_lshr_b32 s0, s2, s3
+; VI-NEXT:    s_and_b32 s0, s0, 7
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %c = lshr i32 %a, %b
   %d = and i32 %c, 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 4f65acda2a210..4ce0ff20e3b73 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -336,33 +336,33 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
 define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-SDAG-LABEL: s_exp_v2f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3fb8a000
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    s_and_b32 s0, s7, 0xfffff000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    v_sub_f32_e32 v2, s7, v2
+; VI-SDAG-NEXT:    s_and_b32 s4, s3, 0xfffff000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, s3, v2
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v2
 ; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x39a3b295
-; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s0, v0
-; VI-SDAG-NEXT:    v_mul_f32_e32 v5, s0, v4
-; VI-SDAG-NEXT:    s_and_b32 s0, s6, 0xfffff000
+; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s4, v0
+; VI-SDAG-NEXT:    v_mul_f32_e32 v5, s4, v4
+; VI-SDAG-NEXT:    s_and_b32 s4, s2, 0xfffff000
 ; VI-SDAG-NEXT:    v_rndne_f32_e32 v3, v1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s4
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; VI-SDAG-NEXT:    v_add_f32_e32 v2, v5, v2
-; VI-SDAG-NEXT:    v_sub_f32_e32 v6, s6, v6
+; VI-SDAG-NEXT:    v_sub_f32_e32 v6, s2, v6
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s0, v0
+; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x39a3b295, v6
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x3fb8a000, v6
 ; VI-SDAG-NEXT:    v_exp_f32_e32 v1, v1
 ; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; VI-SDAG-NEXT:    v_rndne_f32_e32 v5, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s0, v4
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s4, v4
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v5
 ; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v6
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v4
@@ -370,48 +370,48 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v4, v5
 ; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v2
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v4
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v2
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v2
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_exp_v2f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8a000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x39a3b295
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    s_and_b32 s0, s6, 0xfffff000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, s6, v2
+; VI-GISEL-NEXT:    s_and_b32 s4, s2, 0xfffff000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT:    v_sub_f32_e32 v2, s2, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s0, v0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s4, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v4
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, s0, v1
-; VI-GISEL-NEXT:    s_and_b32 s0, s7, 0xfffff000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, s4, v1
+; VI-GISEL-NEXT:    s_and_b32 s4, s3, 0xfffff000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v4, v2
 ; VI-GISEL-NEXT:    v_rndne_f32_e32 v4, v3
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, s7, v5
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, s3, v5
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x39a3b295, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3fb8a000, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s0, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; VI-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v2, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
@@ -422,19 +422,19 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; VI-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc2ce8ed0
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v4
+; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v3
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index ff20f90f05ca0..5ab960f47f57b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -338,33 +338,33 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) {
 define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-SDAG-LABEL: s_exp10_v2f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x40549000
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    s_and_b32 s0, s7, 0xfffff000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT:    v_sub_f32_e32 v2, s7, v2
+; VI-SDAG-NEXT:    s_and_b32 s4, s3, 0xfffff000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; VI-SDAG-NEXT:    v_sub_f32_e32 v2, s3, v2
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v2
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x40549000, v2
 ; VI-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3a2784bc
-; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s0, v0
-; VI-SDAG-NEXT:    v_mul_f32_e32 v5, s0, v4
-; VI-SDAG-NEXT:    s_and_b32 s0, s6, 0xfffff000
+; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s4, v0
+; VI-SDAG-NEXT:    v_mul_f32_e32 v5, s4, v4
+; VI-SDAG-NEXT:    s_and_b32 s4, s2, 0xfffff000
 ; VI-SDAG-NEXT:    v_rndne_f32_e32 v3, v1
-; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s0
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s4
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; VI-SDAG-NEXT:    v_add_f32_e32 v2, v5, v2
-; VI-SDAG-NEXT:    v_sub_f32_e32 v6, s6, v6
+; VI-SDAG-NEXT:    v_sub_f32_e32 v6, s2, v6
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s0, v0
+; VI-SDAG-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v7, 0x3a2784bc, v6
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v6, 0x40549000, v6
 ; VI-SDAG-NEXT:    v_exp_f32_e32 v1, v1
 ; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; VI-SDAG-NEXT:    v_rndne_f32_e32 v5, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s0, v4
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s4, v4
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v5
 ; VI-SDAG-NEXT:    v_add_f32_e32 v4, v4, v6
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v4
@@ -372,48 +372,48 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v4, v5
 ; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc23369f4
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v2
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x421a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v4
-; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v2
+; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v2
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
 ; VI-GISEL-LABEL: s_exp10_v2f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x40549000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3a2784bc
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    s_and_b32 s0, s6, 0xfffff000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, s6, v2
+; VI-GISEL-NEXT:    s_and_b32 s4, s2, 0xfffff000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT:    v_sub_f32_e32 v2, s2, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x40549000, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s0, v0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s4, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v4
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, s0, v1
-; VI-GISEL-NEXT:    s_and_b32 s0, s7, 0xfffff000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, s4, v1
+; VI-GISEL-NEXT:    s_and_b32 s4, s3, 0xfffff000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v4, v2
 ; VI-GISEL-NEXT:    v_rndne_f32_e32 v4, v3
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, s7, v5
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, s3, v5
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3a2784bc, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x40549000, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s0, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; VI-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v2, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
@@ -424,19 +424,19 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; VI-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc23369f4
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x421a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v4
+; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v3
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 06fa910366584..6cca705f7b1db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -222,25 +222,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; VI-SDAG-LABEL: s_exp2_v2f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; VI-SDAG-NEXT:    v_add_f32_e32 v4, s7, v4
-; VI-SDAG-NEXT:    v_add_f32_e32 v1, s6, v1
+; VI-SDAG-NEXT:    v_add_f32_e32 v4, s3, v4
+; VI-SDAG-NEXT:    v_add_f32_e32 v1, s2, v1
 ; VI-SDAG-NEXT:    v_exp_f32_e32 v4, v4
 ; VI-SDAG-NEXT:    v_exp_f32_e32 v2, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v1, v4, v3
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, v2, v0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index ad70589b544ee..d847af780acab 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -568,15 +568,15 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ;
 ; GFX1100-SDAG-LABEL: s_log_v2f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s7
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s6
+; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s3
+; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1
+; GFX1100-SDAG-NEXT:    v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v1, v1
@@ -589,29 +589,29 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX1100-SDAG-NEXT:    v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1100-SDAG-NEXT:    v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x41b17218, s0
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 0x41b17218, s1
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x41b17218, s4
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 0x41b17218, s5
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
 ; GFX1100-SDAG-NEXT:    v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5
-; GFX1100-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX1100-SDAG-NEXT:    s_nop 0
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log_v2f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s6
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s7
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s3
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s7, v1
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
@@ -624,14 +624,14 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x41b17218, s0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x41b17218, s1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x41b17218, s4
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x41b17218, s5
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
 ; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
-; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_nop 0
 ; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 82c73fa441aaf..3f060de9f6596 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -568,15 +568,15 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX1100-SDAG-LABEL: s_log10_v2f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s7
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s6
+; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s3
+; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1
+; GFX1100-SDAG-NEXT:    v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v1, v1
@@ -589,29 +589,29 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-SDAG-NEXT:    v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1100-SDAG-NEXT:    v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x411a209b, s0
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 0x411a209b, s1
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x411a209b, s4
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 0x411a209b, s5
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
 ; GFX1100-SDAG-NEXT:    v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5
-; GFX1100-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX1100-SDAG-NEXT:    s_nop 0
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log10_v2f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s6
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s7
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s3
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s7, v1
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
@@ -624,14 +624,14 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x411a209b, s0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x411a209b, s1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x411a209b, s4
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x411a209b, s5
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
 ; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
-; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_nop 0
 ; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index b76e6214922bd..90a15ae8d9b28 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -265,25 +265,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; VI-SDAG-LABEL: s_log2_v2f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s7, v4
-; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s6, v1
+; VI-SDAG-NEXT:    v_mul_f32_e32 v4, s3, v4
+; VI-SDAG-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; VI-SDAG-NEXT:    v_log_f32_e32 v4, v4
 ; VI-SDAG-NEXT:    v_log_f32_e32 v2, v1
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v1, v4, v3
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v2, v0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s5
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    s_endpgm
 ;
@@ -359,49 +359,49 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX1100-SDAG-LABEL: s_log2_v2f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1100-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s7
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s6
+; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s3
+; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s1
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 0x42000000, s0
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s1
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s4
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s5
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 0x42000000, s4
+; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s5
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, s7, v1
-; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v3, s6, v3
+; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, s3, v1
+; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v3, s2, v3
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v1, v1
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v3, v3
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2
-; GFX1100-SDAG-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
+; GFX1100-SDAG-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
 ; GFX1100-SDAG-NEXT:    s_nop 0
 ; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log2_v2f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s6
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s7
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s3
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 0x42000000, s1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s4
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 0x42000000, s5
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s7, v1
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_nop 0
 ; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index 9fcbdf3968693..826862e124920 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -224,11 +224,11 @@ define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) {
 ;
 ; VI-LABEL: local_size_yz:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x1c
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x1c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mul_i32 s0, s4, s5
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    s_mul_i32 s0, s0, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index c5d2f791d1677..8196999b8f1f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -133,56 +133,82 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: round_v2f32:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT:    s_brev_b32 s8, -2
-; GFX89-NEXT:    s_mov_b32 s3, 0xf000
-; GFX89-NEXT:    s_mov_b32 s2, -1
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s7
-; GFX89-NEXT:    v_sub_f32_e32 v1, s7, v0
-; GFX89-NEXT:    s_mov_b32 s0, s4
-; GFX89-NEXT:    s_mov_b32 s1, s5
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
-; GFX89-NEXT:    v_mov_b32_e32 v2, s7
-; GFX89-NEXT:    v_bfi_b32 v1, s8, v1, v2
-; GFX89-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s6
-; GFX89-NEXT:    v_sub_f32_e32 v2, s6, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX89-NEXT:    v_mov_b32_e32 v3, s6
-; GFX89-NEXT:    v_bfi_b32 v2, s8, v2, v3
-; GFX89-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: round_v2f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_brev_b32 s8, -2
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s3
+; GFX8-NEXT:    v_sub_f32_e32 v1, s3, v0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_bfi_b32 v1, s8, v1, v2
+; GFX8-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s2
+; GFX8-NEXT:    v_sub_f32_e32 v2, s2, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_bfi_b32 v2, s8, v2, v3
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: round_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_brev_b32 s8, -2
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s7
+; GFX9-NEXT:    v_sub_f32_e32 v1, s7, v0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_bfi_b32 v1, s8, v1, v2
+; GFX9-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX9-NEXT:    v_sub_f32_e32 v2, s6, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    v_bfi_b32 v2, s8, v2, v3
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v0, s7
-; GFX11-NEXT:    v_trunc_f32_e32 v2, s6
+; GFX11-NEXT:    v_trunc_f32_e32 v0, s3
+; GFX11-NEXT:    v_trunc_f32_e32 v2, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_sub_f32_e32 v1, s7, v0
-; GFX11-NEXT:    v_sub_f32_e32 v3, s6, v2
+; GFX11-NEXT:    v_sub_f32_e32 v1, s3, v0
+; GFX11-NEXT:    v_sub_f32_e32 v3, s2, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v1|, 0.5
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s0
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s4, |v1|, 0.5
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v3|, 0.5
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, s7
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s4, |v3|, 0.5
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, s3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s4
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    v_add_f32_e32 v1, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v3, s6
-; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v3, s2
+; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    v_add_f32_e32 v0, v2, v3
-; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 67071327e3aff..a54405bf1b471 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -18,18 +18,18 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 ;
 ; VI-LABEL: s_lshr_v2i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s1, s6, 16
-; VI-NEXT:    s_lshr_b32 s2, s7, 16
-; VI-NEXT:    s_and_b32 s0, s6, 0xffff
-; VI-NEXT:    s_lshr_b32 s1, s1, s2
-; VI-NEXT:    s_lshr_b32 s0, s0, s7
-; VI-NEXT:    s_lshl_b32 s1, s1, 16
-; VI-NEXT:    s_or_b32 s0, s0, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_and_b32 s4, s2, 0xffff
+; VI-NEXT:    s_lshr_b32 s2, s2, 16
+; VI-NEXT:    s_lshr_b32 s5, s3, 16
+; VI-NEXT:    s_lshr_b32 s2, s2, s5
+; VI-NEXT:    s_lshr_b32 s3, s4, s3
+; VI-NEXT:    s_lshl_b32 s2, s2, 16
+; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -63,11 +63,11 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 ;
 ; GFX11-LABEL: s_lshr_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_lshrrev_b16 v1, s7, s6
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index def0dfa4b903b..944db3d3adc3a 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -190,11 +190,11 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX8-LABEL: madak_2_use_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -206,8 +206,8 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v4, v[4:5] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0x41200000
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_madak_f32 v6, v7, v8, 0x41200000
@@ -259,22 +259,22 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX11-MAD-LABEL: madak_2_use_f32:
 ; GFX11-MAD:       ; %bb.0:
-; GFX11-MAD-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
 ; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[6:7] offset:4 glc dlc
+; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
 ; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-MAD-NEXT:    global_load_b32 v3, v0, s[6:7] offset:8 glc dlc
+; GFX11-MAD-NEXT:    global_load_b32 v3, v0, s[2:3] offset:8 glc dlc
 ; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-MAD-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT:    v_dual_add_f32 v1, 0x41200000, v1 :: v_dual_add_f32 v2, 0x41200000, v2
-; GFX11-MAD-NEXT:    global_store_b32 v0, v2, s[4:5] dlc
+; GFX11-MAD-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
 ; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[6:7] offset:4 dlc
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[2:3] offset:4 dlc
 ; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-MAD-NEXT:    s_nop 0
 ; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -321,20 +321,20 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX11-FMA-LABEL: madak_2_use_f32:
 ; GFX11-FMA:       ; %bb.0:
-; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[6:7] offset:4 glc dlc
+; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b32 v3, v0, s[6:7] offset:8 glc dlc
+; GFX11-FMA-NEXT:    global_load_b32 v3, v0, s[2:3] offset:8 glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
 ; GFX11-FMA-NEXT:    v_fmaak_f32 v1, v1, v3, 0x41200000
-; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[4:5] dlc
+; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
 ; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[6:7] offset:4 dlc
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[2:3] offset:4 dlc
 ; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FMA-NEXT:    s_nop 0
 ; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -381,15 +381,15 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX8-LABEL: madak_m_inline_imm_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_madak_f32 v2, 4.0, v3, 0x41200000
@@ -420,15 +420,15 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX11-MAD-LABEL: madak_m_inline_imm_f32:
 ; GFX11-MAD:       ; %bb.0:
-; GFX11-MAD-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, 4.0, v1
 ; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
-; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-MAD-NEXT:    s_nop 0
 ; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
@@ -457,13 +457,13 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX11-FMA-LABEL: madak_m_inline_imm_f32:
 ; GFX11-FMA:       ; %bb.0:
-; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
-; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-FMA-NEXT:    s_nop 0
 ; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
@@ -919,13 +919,13 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ;
 ; GFX8-LABEL: s_s_madak_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x41200000
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s7
-; GFX8-NEXT:    v_mac_f32_e32 v2, s6, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    v_mac_f32_e32 v2, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -952,12 +952,12 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ;
 ; GFX11-MAD-LABEL: s_s_madak_f32:
 ; GFX11-MAD:       ; %bb.0:
-; GFX11-MAD-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-MAD-NEXT:    v_mul_f32_e64 v0, s6, s7
+; GFX11-MAD-NEXT:    v_mul_f32_e64 v0, s2, s3
 ; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 0x41200000, v0
-; GFX11-MAD-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-MAD-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-MAD-NEXT:    s_nop 0
 ; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
@@ -985,12 +985,12 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ;
 ; GFX11-FMA-LABEL: s_s_madak_f32:
 ; GFX11-FMA:       ; %bb.0:
-; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s7
+; GFX11-FMA-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FMA-NEXT:    v_fmaak_f32 v0, s6, v0, 0x41200000
-; GFX11-FMA-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-FMA-NEXT:    v_fmaak_f32 v0, s2, v0, 0x41200000
+; GFX11-FMA-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-FMA-NEXT:    s_nop 0
 ; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 5c88328b6dd91..86a5055ab0704 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -24,22 +24,22 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture read
 ;
 ; GCN-SCRATCH-LABEL: vector_clause:
 ; GCN-SCRATCH:       ; %bb.0: ; %bb
-; GCN-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-SCRATCH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-SCRATCH-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
 ; GCN-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    s_clause 0x3
-; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
-; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:16
-; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:32
-; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:48
+; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
+; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
+; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
+; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(3)
-; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
+; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(2)
-; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
-; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
 ; GCN-SCRATCH-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
index a87503c731c7a..b08da2e1848ff 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
@@ -8,9 +8,9 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: merge_s_load_x1_x1
     ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: early-clobber %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1
+    ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
     %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32))
@@ -48,16 +48,16 @@ body: |
   bb.0:
     ; GFX11-LABEL: name: merge_s_load_x1_x1_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: early-clobber %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %4.sub0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %4.sub1
+    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
     ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
     ;
     ; GFX12-LABEL: name: merge_s_load_x1_x1_x1
     ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: early-clobber %5:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %5.sub0_sub1
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %5.sub2
+    ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0
     ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1
     %0:sgpr_64 = IMPLICIT_DEF
@@ -72,9 +72,9 @@ body: |
   bb.0:
     ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %7.sub0_sub1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed %7.sub2_sub3
+    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1
     ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0
@@ -82,9 +82,9 @@ body: |
     ;
     ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1
     ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %7.sub0_sub1_sub2
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %7.sub3
+    ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1
     ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub2
     ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
@@ -102,9 +102,9 @@ body: |
   bb.0:
     ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7
+    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY]].sub2_sub3
     ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
@@ -120,9 +120,9 @@ body: |
     ;
     ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
     ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7
+    ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_96 = COPY [[COPY]].sub0_sub1_sub2
     ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub3
     ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1
@@ -157,9 +157,9 @@ body: |
     ;
     ; GFX12-LABEL: name: merge_s_load_x2_x1
     ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: early-clobber %3:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub2
+    ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8)
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
     %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32))
@@ -171,9 +171,9 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: merge_s_load_x2_x2
     ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3
+    ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
     %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
@@ -185,9 +185,9 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: merge_s_load_x2_x2_x2_x2
     ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7
+    ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
     ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
@@ -205,9 +205,9 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: merge_s_load_x3_x1
     ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128))
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %3.sub0_sub1_sub2
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub3
+    ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128))
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_96 = S_LOAD_DWORDX3_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96))
     %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32))
@@ -219,118 +219,10 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: merge_s_load_x4_x4
     ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7
+    ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128))
     %2:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128))
 ...
-
-# The constrained multi-dword scalar load merge tests.
----
-name: merge_s_load_x1_x2ec
-body: |
-  bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x1_x2ec
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
-    ; CHECK-NEXT: early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s64))
-    %0:sgpr_64 = IMPLICIT_DEF
-    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
-    early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s64))
-...
-
----
-name: merge_s_load_x1_x3ec
-body: |
-  bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x1_x3ec
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
-    ; CHECK-NEXT: early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s96), align 16)
-    %0:sgpr_64 = IMPLICIT_DEF
-    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
-    early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s96))
-...
-
----
-name: merge_s_load_x2ec_x1
-body: |
-  bb.0:
-    ; GFX11-LABEL: name: merge_s_load_x2ec_x1
-    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64))
-    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
-    ;
-    ; GFX12-LABEL: name: merge_s_load_x2ec_x1
-    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: early-clobber %3:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub2
-    %0:sgpr_64 = IMPLICIT_DEF
-    early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
-    %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32))
-...
-
----
-name: merge_s_load_x2ec_x2ec
-body: |
-  bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x2ec_x2ec
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3
-    %0:sgpr_64 = IMPLICIT_DEF
-    early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
-    early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
-...
-
----
-name: merge_s_load_x2ec_x2ec_x2ec_x2ec
-body: |
-  bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
-    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
-    %0:sgpr_64 = IMPLICIT_DEF
-    early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
-    early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
-    early-clobber %3:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s64))
-    early-clobber %4:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64))
-...
-
----
-name: merge_s_load_x3ec_x1
-body: |
-  bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x3ec_x1
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128))
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %3.sub0_sub1_sub2
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub3
-    %0:sgpr_64 = IMPLICIT_DEF
-    early-clobber %1:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96))
-    %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32))
-...
-
----
-name: merge_s_load_x4ec_x4ec
-body: |
-  bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x4ec_x4ec
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7
-    %0:sgpr_64 = IMPLICIT_DEF
-    early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128))
-    early-clobber %2:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128))
-...
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 2334543157b6e..9dafa27ece86f 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -185,13 +185,13 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX11-LABEL: s_test_imin_sle_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_i32 s0, s6, s7
+; GFX11-NEXT:    s_min_i32 s2, s2, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -257,13 +257,13 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ;
 ; GFX11-LABEL: s_test_imin_sle_v1i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_i32 s0, s6, s7
+; GFX11-NEXT:    s_min_i32 s2, s2, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -808,11 +808,11 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ;
 ; GFX11-LABEL: s_test_imin_sle_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_min_i16 v1, s6, s7
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_pk_min_i16 v1, s2, s3
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1324,13 +1324,13 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX11-LABEL: s_test_imin_slt_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_i32 s0, s6, s7
+; GFX11-NEXT:    s_min_i32 s2, s2, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2084,13 +2084,13 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX11-LABEL: s_test_umin_ule_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_u32 s0, s6, s7
+; GFX11-NEXT:    s_min_u32 s2, s2, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2405,13 +2405,13 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX11-LABEL: s_test_umin_ult_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_u32 s0, s6, s7
+; GFX11-NEXT:    s_min_u32 s2, s2, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2763,13 +2763,13 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ;
 ; GFX11-LABEL: s_test_umin_ult_v1i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_min_u32 s0, s6, s7
+; GFX11-NEXT:    s_min_u32 s2, s2, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 3c60153df441e..b4272049f36a4 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -676,17 +676,14 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
 ;
 ; GFX12-LABEL: mul64_sext_c:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mul_u64 s[4:5], s[2:3], 0x50
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_mov_b32 s2, -1
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_ashr_i32 s1, s6, 31
-; GFX12-NEXT:    s_mov_b32 s0, s6
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mul_u64 s[0:1], s[0:1], 0x50
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    s_mov_b32 s0, s4
-; GFX12-NEXT:    s_mov_b32 s1, s5
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -787,17 +784,13 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
 ;
 ; GFX12-LABEL: mul64_zext_c:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_mov_b32 s3, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mul_u64 s[4:5], s[2:3], 0x50
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_mov_b32 s2, -1
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, s6
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mul_u64 s[0:1], s[0:1], 0x50
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    s_mov_b32 s0, s4
-; GFX12-NEXT:    s_mov_b32 s1, s5
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 6d7bf0027a822..58fd4b9bd2fee 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -24,17 +24,17 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; VI-LABEL: test_smul24_i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_bfe_i32 s4, s6, 0x180000
-; VI-NEXT:    s_bfe_i32 s5, s7, 0x180000
-; VI-NEXT:    s_mul_i32 s4, s4, s5
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_bfe_i32 s0, s2, 0x180000
+; VI-NEXT:    s_bfe_i32 s1, s3, 0x180000
+; VI-NEXT:    s_mul_i32 s0, s0, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_smul24_i32:
@@ -113,15 +113,15 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; VI-LABEL: test_smulhi24_i64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s6, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s2, v0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_smulhi24_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index e6470a5833d3a..698a54de108f7 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -24,17 +24,17 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; VI-LABEL: test_umul24_i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_and_b32 s4, s6, 0xffffff
-; VI-NEXT:    s_and_b32 s5, s7, 0xffffff
-; VI-NEXT:    s_mul_i32 s4, s4, s5
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_and_b32 s0, s2, 0xffffff
+; VI-NEXT:    s_and_b32 s1, s3, 0xffffff
+; VI-NEXT:    s_mul_i32 s0, s0, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_umul24_i32:
@@ -392,15 +392,15 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a,
 ;
 ; VI-LABEL: test_umulhi24_i32_i64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s7
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    v_mul_hi_u32_u24_e32 v0, s6, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mul_hi_u32_u24_e32 v0, s2, v0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_umulhi24_i32_i64:
@@ -647,18 +647,18 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; VI-LABEL: test_umulhi16_i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_and_b32 s4, s6, 0xffff
-; VI-NEXT:    s_and_b32 s5, s7, 0xffff
-; VI-NEXT:    s_mul_i32 s4, s4, s5
-; VI-NEXT:    s_lshr_b32 s4, s4, 16
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_and_b32 s0, s2, 0xffff
+; VI-NEXT:    s_and_b32 s1, s3, 0xffff
+; VI-NEXT:    s_mul_i32 s0, s0, s1
+; VI-NEXT:    s_lshr_b32 s0, s0, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_umulhi16_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 65f4a1b17a881..0473f803bfb30 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -156,15 +156,15 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
 ;
 ; GFX8-LABEL: scalar_or_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s0, s4
-; GFX8-NEXT:    s_or_b32 s4, s6, s7
-; GFX8-NEXT:    s_mov_b32 s1, s5
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_or_b32 s0, s2, s3
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: scalar_or_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 4a004731c42f9..fcccd2da07f76 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -35,13 +35,13 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX8-LABEL: rotl_i32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sub_i32 s0, 32, s7
+; GFX8-NEXT:    s_sub_i32 s3, 32, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    v_alignbit_b32 v2, s2, s2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_alignbit_b32 v2, s6, s6, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -57,13 +57,13 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX11-LABEL: rotl_i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sub_i32 s0, 32, s7
+; GFX11-NEXT:    s_sub_i32 s3, 32, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_alignbit_b32 v1, s6, s6, s0
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v1, s2, s2, s3
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index d6431d731a1f7..214894092a8b0 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -32,12 +32,12 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX8-LABEL: rotr_i32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s7
-; GFX8-NEXT:    v_alignbit_b32 v2, s6, s6, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    v_alignbit_b32 v2, s2, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -52,11 +52,11 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX11-LABEL: rotr_i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, s6, s6, s7
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_alignbit_b32 v1, s2, s2, s3
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 1384fb0e0203a..acdcb631dccbd 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -21,21 +21,21 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 ;
 ; VI-LABEL: s_shl_v2i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_lshr_b32 s4, s6, 16
-; VI-NEXT:    s_lshr_b32 s5, s7, 16
-; VI-NEXT:    s_lshl_b32 s4, s4, s5
-; VI-NEXT:    s_lshl_b32 s5, s6, s7
-; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_and_b32 s5, s5, 0xffff
-; VI-NEXT:    s_or_b32 s4, s5, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshr_b32 s0, s2, 16
+; VI-NEXT:    s_lshr_b32 s1, s3, 16
+; VI-NEXT:    s_lshl_b32 s0, s0, s1
+; VI-NEXT:    s_lshl_b32 s1, s2, s3
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: s_shl_v2i16:
@@ -71,14 +71,14 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 ;
 ; GFX11-LABEL: s_shl_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_lshlrev_b16 v0, s7, s6
-; GFX11-NEXT:    s_mov_b32 s0, s4
-; GFX11-NEXT:    s_mov_b32 s1, s5
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    v_pk_lshlrev_b16 v0, s3, s2
+; GFX11-NEXT:    s_mov_b32 s4, s0
+; GFX11-NEXT:    s_mov_b32 s5, s1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index 33249e4faeccd..9a03d216c7a99 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -19,16 +19,16 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %
 ;
 ; VI-LABEL: s_sext_i1_to_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_eq_u32 s6, s7
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_cmp_eq_u32 s2, s3
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i32
@@ -96,17 +96,17 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %
 ;
 ; VI-LABEL: s_sext_i1_to_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_eq_u32 s6, s7
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
+; VI-NEXT:    s_cmp_eq_u32 s2, s3
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, v0
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i64
@@ -235,16 +235,16 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %
 ;
 ; VI-LABEL: s_sext_i1_to_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_eq_u32 s6, s7
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_cmp_eq_u32 s2, s3
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i16
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 23222a45818c0..f88aaf389ca9a 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -208,11 +208,11 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 {
 ; GFX11-LABEL: local_store_i48:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
-; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    ds_store_b16 v0, v1 offset:4
 ; GFX11-NEXT:    ds_store_b32 v0, v2
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 45aa544bf80af..3ae982089228d 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -22,12 +22,12 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ;
 ; GFX8-LABEL: s_sub_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sub_i32 s0, s6, s7
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    s_sub_i32 s2, s2, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -43,12 +43,12 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ;
 ; GFX12-LABEL: s_sub_i32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_sub_co_i32 s0, s6, s7
+; GFX12-NEXT:    s_sub_co_i32 s2, s2, s3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -95,12 +95,12 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
 ;
 ; GFX12-LABEL: s_sub_imm_i32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[4:6], s[0:1], 0x24
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_sub_co_i32 s0, 0x4d2, s6
+; GFX12-NEXT:    s_sub_co_i32 s2, 0x4d2, s2
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 6c53afe840d18..bfeab97d81dbe 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -227,21 +227,21 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ;
 ; VI-LABEL: s_test_sub_v2i16_kernarg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_lshr_b32 s4, s6, 16
-; VI-NEXT:    s_lshr_b32 s5, s7, 16
-; VI-NEXT:    s_sub_i32 s4, s4, s5
-; VI-NEXT:    s_sub_i32 s5, s6, s7
-; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_and_b32 s5, s5, 0xffff
-; VI-NEXT:    s_or_b32 s4, s5, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshr_b32 s0, s2, 16
+; VI-NEXT:    s_lshr_b32 s1, s3, 16
+; VI-NEXT:    s_sub_i32 s0, s0, s1
+; VI-NEXT:    s_sub_i32 s1, s2, s3
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_sub_v2i16_kernarg:
@@ -258,14 +258,14 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ;
 ; GFX11-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_pk_sub_i16 v0, s6, s7
-; GFX11-NEXT:    s_mov_b32 s0, s4
-; GFX11-NEXT:    s_mov_b32 s1, s5
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    v_pk_sub_i16 v0, s2, s3
+; GFX11-NEXT:    s_mov_b32 s4, s0
+; GFX11-NEXT:    s_mov_b32 s5, s1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index c7952f561427c..f686aad0cefc2 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -218,35 +218,35 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ;
 ; VI-LABEL: s_udiv_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; VI-NEXT:    s_sub_i32 s0, 0, s7
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; VI-NEXT:    s_sub_i32 s4, 0, s3
+; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; VI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT:    v_mul_lo_u32 v1, s0, v0
-; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    v_mul_lo_u32 v1, s4, v0
+; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; VI-NEXT:    v_mul_hi_u32 v0, s6, v0
-; VI-NEXT:    v_readfirstlane_b32 s4, v0
-; VI-NEXT:    s_mul_i32 s4, s4, s7
-; VI-NEXT:    s_sub_i32 s4, s6, s4
-; VI-NEXT:    s_sub_i32 s5, s4, s7
+; VI-NEXT:    v_mul_hi_u32 v0, s2, v0
+; VI-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-NEXT:    s_mul_i32 s0, s0, s3
+; VI-NEXT:    s_sub_i32 s0, s2, s0
+; VI-NEXT:    s_sub_i32 s1, s0, s3
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
-; VI-NEXT:    s_cmp_ge_u32 s4, s7
+; VI-NEXT:    s_cmp_ge_u32 s0, s3
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NEXT:    s_cselect_b32 s4, s5, s4
+; VI-NEXT:    s_cselect_b32 s0, s1, s0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
-; VI-NEXT:    s_cmp_ge_u32 s4, s7
+; VI-NEXT:    s_cmp_ge_u32 s0, s3
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GCN-LABEL: s_udiv_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index e5de7d0a74642..65eb1cee42350 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -122,14 +122,14 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
 ;
 ; VI-LABEL: v_cnd_nan:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_eq_u32 s6, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    s_cmp_eq_u32 s2, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -146,14 +146,14 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
 ;
 ; GFX11-LABEL: v_cnd_nan:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX11-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, -1, s7, s[0:1]
-; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -1, s3, s[4:5]
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 7f69c4733b38a..e3185e189157b 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -89,16 +89,16 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) {
 define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg %src0ext, i32 inreg %src1ext) {
 ; SDAG-VI-LABEL: basic_smax_smin_sgpr:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; SDAG-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, 0xff
 ; SDAG-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT:    v_max_i16_e64 v1, s6, 0
-; SDAG-VI-NEXT:    v_max_i16_e64 v2, s7, 0
+; SDAG-VI-NEXT:    v_max_i16_e64 v1, s2, 0
+; SDAG-VI-NEXT:    v_max_i16_e64 v2, s3, 0
 ; SDAG-VI-NEXT:    v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; SDAG-VI-NEXT:    v_min_i16_e32 v1, 0xff, v1
 ; SDAG-VI-NEXT:    v_or_b32_e32 v2, v1, v0
-; SDAG-VI-NEXT:    v_mov_b32_e32 v0, s4
-; SDAG-VI-NEXT:    v_mov_b32_e32 v1, s5
+; SDAG-VI-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT:    v_mov_b32_e32 v1, s1
 ; SDAG-VI-NEXT:    flat_store_dword v[0:1], v2
 ; SDAG-VI-NEXT:    s_endpgm
 ;
@@ -117,40 +117,40 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ;
 ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr:
 ; SDAG-GFX11:       ; %bb.0:
-; SDAG-GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; SDAG-GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_med3_i16 v0, s6, 0, 0xff
-; SDAG-GFX11-NEXT:    v_med3_i16 v1, s7, 0, 0xff
+; SDAG-GFX11-NEXT:    v_med3_i16 v0, s2, 0, 0xff
+; SDAG-GFX11-NEXT:    v_med3_i16 v1, s3, 0, 0xff
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SDAG-GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; SDAG-GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
+; SDAG-GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; SDAG-GFX11-NEXT:    s_nop 0
 ; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: basic_smax_smin_sgpr:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GISEL-VI-NEXT:    s_sext_i32_i16 s0, 0
-; GISEL-VI-NEXT:    s_sext_i32_i16 s1, 0xff
+; GISEL-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT:    s_sext_i32_i16 s4, 0
+; GISEL-VI-NEXT:    s_sext_i32_i16 s5, 0xff
 ; GISEL-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT:    s_sext_i32_i16 s2, s6
-; GISEL-VI-NEXT:    s_sext_i32_i16 s3, s7
-; GISEL-VI-NEXT:    s_max_i32 s2, s2, s0
-; GISEL-VI-NEXT:    s_max_i32 s0, s3, s0
-; GISEL-VI-NEXT:    s_sext_i32_i16 s0, s0
+; GISEL-VI-NEXT:    s_sext_i32_i16 s3, s3
 ; GISEL-VI-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT:    s_min_i32 s0, s0, s1
-; GISEL-VI-NEXT:    s_min_i32 s2, s2, s1
-; GISEL-VI-NEXT:    s_and_b32 s0, 0xffff, s0
-; GISEL-VI-NEXT:    s_and_b32 s1, 0xffff, s2
-; GISEL-VI-NEXT:    s_lshl_b32 s0, s0, 16
-; GISEL-VI-NEXT:    s_or_b32 s0, s1, s0
-; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s0
-; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-VI-NEXT:    s_max_i32 s3, s3, s4
+; GISEL-VI-NEXT:    s_max_i32 s2, s2, s4
+; GISEL-VI-NEXT:    s_sext_i32_i16 s3, s3
+; GISEL-VI-NEXT:    s_sext_i32_i16 s2, s2
+; GISEL-VI-NEXT:    s_min_i32 s3, s3, s5
+; GISEL-VI-NEXT:    s_min_i32 s2, s2, s5
+; GISEL-VI-NEXT:    s_and_b32 s3, 0xffff, s3
+; GISEL-VI-NEXT:    s_and_b32 s2, 0xffff, s2
+; GISEL-VI-NEXT:    s_lshl_b32 s3, s3, 16
+; GISEL-VI-NEXT:    s_or_b32 s2, s2, s3
+; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s1
 ; GISEL-VI-NEXT:    flat_store_dword v[0:1], v2
 ; GISEL-VI-NEXT:    s_endpgm
 ;
@@ -176,23 +176,23 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ;
 ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_sext_i32_i16 s0, 0
-; GISEL-GFX11-NEXT:    s_sext_i32_i16 s1, 0xff
+; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT:    s_sext_i32_i16 s4, 0
+; GISEL-GFX11-NEXT:    s_sext_i32_i16 s5, 0xff
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    s_sext_i32_i16 s2, s6
-; GISEL-GFX11-NEXT:    s_sext_i32_i16 s3, s7
-; GISEL-GFX11-NEXT:    s_max_i32 s2, s2, s0
-; GISEL-GFX11-NEXT:    s_max_i32 s0, s3, s0
 ; GISEL-GFX11-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-GFX11-NEXT:    s_sext_i32_i16 s0, s0
-; GISEL-GFX11-NEXT:    s_min_i32 s2, s2, s1
-; GISEL-GFX11-NEXT:    s_min_i32 s0, s0, s1
+; GISEL-GFX11-NEXT:    s_sext_i32_i16 s3, s3
+; GISEL-GFX11-NEXT:    s_max_i32 s2, s2, s4
+; GISEL-GFX11-NEXT:    s_max_i32 s3, s3, s4
+; GISEL-GFX11-NEXT:    s_sext_i32_i16 s2, s2
+; GISEL-GFX11-NEXT:    s_sext_i32_i16 s3, s3
+; GISEL-GFX11-NEXT:    s_min_i32 s2, s2, s5
+; GISEL-GFX11-NEXT:    s_min_i32 s3, s3, s5
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT:    s_pack_ll_b32_b16 s0, s2, s0
-; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GISEL-GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GISEL-GFX11-NEXT:    s_nop 0
 ; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 5422bfa5389c9..e15fd7f29671a 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -276,12 +276,12 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b)
 ;
 ; VI-LABEL: scalar_xor_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s0, s6, s7
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_xor_b32 s2, s2, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %result = xor i32 %a, %b

>From a9205abe803b1cc6bbebe9147eb64fae6615cbd1 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Tue, 25 Jun 2024 05:21:48 +0000
Subject: [PATCH 3/6] take the alignment into consideration.

---
 .../Target/AMDGPU/SILoadStoreOptimizer.cpp    |  52 +--
 .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll    |  12 +-
 .../GlobalISel/llvm.amdgcn.update.dpp.ll      |  28 +-
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 152 +++----
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll |  84 ++--
 llvm/test/CodeGen/AMDGPU/add.v2i16.ll         |  14 +-
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      | 408 +++++++++---------
 llvm/test/CodeGen/AMDGPU/build_vector.ll      |  12 +-
 llvm/test/CodeGen/AMDGPU/cluster_stores.ll    |  88 ++--
 .../CodeGen/AMDGPU/combine-cond-add-sub.ll    |  20 +-
 llvm/test/CodeGen/AMDGPU/ctlz.ll              | 180 ++++----
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   | 158 +++----
 llvm/test/CodeGen/AMDGPU/cttz.ll              | 184 ++++----
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   | 126 +++---
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll     | 262 +++++------
 .../AMDGPU/divergence-driven-buildvector.ll   |  32 +-
 llvm/test/CodeGen/AMDGPU/ds_read2.ll          |  16 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |  20 +-
 llvm/test/CodeGen/AMDGPU/fdiv.ll              |  86 ++--
 llvm/test/CodeGen/AMDGPU/flat_atomics.ll      |  20 +-
 .../CodeGen/AMDGPU/flat_atomics_i32_system.ll |  56 +--
 llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll       | 180 ++++----
 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll |  12 +-
 llvm/test/CodeGen/AMDGPU/fshl.ll              |  42 +-
 llvm/test/CodeGen/AMDGPU/fshr.ll              |  14 +-
 llvm/test/CodeGen/AMDGPU/global_atomics.ll    |  16 +-
 .../AMDGPU/global_atomics_i32_system.ll       |  96 ++---
 .../insert_waitcnt_for_precise_memory.ll      | 160 +++----
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll   |  14 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll    |  32 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll    |  24 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll    |  16 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll    |  16 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll    |  16 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll    | 112 ++---
 .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 244 +++++------
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          |  48 +--
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        |  48 +--
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |  12 +-
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |  12 +-
 llvm/test/CodeGen/AMDGPU/llvm.round.ll        |  76 ++--
 llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll        |  14 +-
 llvm/test/CodeGen/AMDGPU/madak.ll             | 104 ++---
 llvm/test/CodeGen/AMDGPU/memory_clause.ll     |  18 +-
 .../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll  | 160 +++----
 llvm/test/CodeGen/AMDGPU/mul_int24.ll         |  40 +-
 llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll |  52 +--
 llvm/test/CodeGen/AMDGPU/packed-op-sel.ll     |  72 ++--
 llvm/test/CodeGen/AMDGPU/preload-kernargs.ll  |  48 +--
 .../AMDGPU/ptr-buffer-alias-scheduling.ll     |  36 +-
 llvm/test/CodeGen/AMDGPU/rotl.ll              |   8 +-
 llvm/test/CodeGen/AMDGPU/rotr.ll              |   6 +-
 llvm/test/CodeGen/AMDGPU/shl.v2i16.ll         |  30 +-
 llvm/test/CodeGen/AMDGPU/sub.ll               |   8 +-
 llvm/test/CodeGen/AMDGPU/sub.v2i16.ll         |  30 +-
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll         |  10 +-
 llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll   |  34 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll            |  54 +--
 58 files changed, 1955 insertions(+), 1969 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index f1c9f19346c7d..7bdc494ff545d 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1725,36 +1725,30 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
     case 8:
       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
     }
-  case S_LOAD_IMM:
-    // Use the constrained opcodes when the subtarget has the XNACK support
-    // enabled.
-    if (STM->isXNACKEnabled()) {
-      switch (Width) {
-      default:
-        return 0;
-      case 2:
-        return AMDGPU::S_LOAD_DWORDX2_IMM_ec;
-      case 3:
-        return AMDGPU::S_LOAD_DWORDX3_IMM_ec;
-      case 4:
-        return AMDGPU::S_LOAD_DWORDX4_IMM_ec;
-      case 8:
-        return AMDGPU::S_LOAD_DWORDX8_IMM_ec;
-      }
-    } else {
-      switch (Width) {
-      default:
-        return 0;
-      case 2:
-        return AMDGPU::S_LOAD_DWORDX2_IMM;
-      case 3:
-        return AMDGPU::S_LOAD_DWORDX3_IMM;
-      case 4:
-        return AMDGPU::S_LOAD_DWORDX4_IMM;
-      case 8:
-        return AMDGPU::S_LOAD_DWORDX8_IMM;
-      }
+  case S_LOAD_IMM: {
+    // If XNACK is enabled, use the constrained opcodes when the first load is
+    // under-aligned.
+    const MachineMemOperand *MMO = *CI.I->memoperands_begin();
+    auto NeedsConstrainedOpc = [&MMO, Width](const GCNSubtarget &ST) {
+      return ST.isXNACKEnabled() && MMO->getAlign().value() < Width;
+    };
+    switch (Width) {
+    default:
+      return 0;
+    case 2:
+      return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
+                                       : AMDGPU::S_LOAD_DWORDX2_IMM;
+    case 3:
+      return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
+                                       : AMDGPU::S_LOAD_DWORDX3_IMM;
+    case 4:
+      return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
+                                       : AMDGPU::S_LOAD_DWORDX4_IMM;
+    case 8:
+      return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
+                                       : AMDGPU::S_LOAD_DWORDX8_IMM;
     }
+  }
   case GLOBAL_LOAD:
     switch (Width) {
     default:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index a018ea5bf18f1..fff341b07881b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -159,10 +159,10 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
 define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX940-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
@@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
 define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s3
-; GFX940-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    ds_pk_add_bf16 v1, v0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 1092bb4dc834a..727184a36c006 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -19,13 +19,13 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
 ;
 ; GFX10-LABEL: dpp_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: dpp_test:
@@ -174,16 +174,16 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
 ;
 ; GFX10-LABEL: update_dppv2i32_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppv2i32_test:
@@ -229,16 +229,16 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
 ;
 ; GFX10-LABEL: update_dppv2f32_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppv2f32_test:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 8a2274cbfbf62..b666f45521661 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -692,121 +692,121 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ;
 ; GFX9-LABEL: sdivrem_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s14, 31
-; GFX9-NEXT:    s_add_i32 s1, s14, s0
-; GFX9-NEXT:    s_xor_b32 s1, s1, s0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX9-NEXT:    s_ashr_i32 s2, s15, 31
-; GFX9-NEXT:    s_add_i32 s3, s15, s2
-; GFX9-NEXT:    s_xor_b32 s3, s3, s2
+; GFX9-NEXT:    s_ashr_i32 s8, s6, 31
+; GFX9-NEXT:    s_add_i32 s6, s6, s8
+; GFX9-NEXT:    s_xor_b32 s6, s6, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT:    s_ashr_i32 s9, s7, 31
+; GFX9-NEXT:    s_add_i32 s7, s7, s9
+; GFX9-NEXT:    s_xor_b32 s7, s7, s9
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT:    s_sub_i32 s6, 0, s1
-; GFX9-NEXT:    s_ashr_i32 s4, s12, 31
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX9-NEXT:    s_sub_i32 s12, 0, s6
+; GFX9-NEXT:    s_ashr_i32 s10, s4, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s7, 0, s3
-; GFX9-NEXT:    s_ashr_i32 s5, s13, 31
-; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v0
+; GFX9-NEXT:    s_add_i32 s4, s4, s10
+; GFX9-NEXT:    s_xor_b32 s4, s4, s10
+; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_add_i32 s6, s12, s4
+; GFX9-NEXT:    s_sub_i32 s12, 0, s7
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_xor_b32 s6, s6, s4
-; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
-; GFX9-NEXT:    s_add_i32 s7, s13, s5
+; GFX9-NEXT:    s_ashr_i32 s11, s5, 31
+; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT:    s_add_i32 s5, s5, s11
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
-; GFX9-NEXT:    s_xor_b32 s7, s7, s5
-; GFX9-NEXT:    s_xor_b32 s0, s4, s0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s1
+; GFX9-NEXT:    s_xor_b32 s5, s5, s11
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, s7, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v3
+; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s1, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s6, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s3
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s7
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX9-NEXT:    s_xor_b32 s4, s10, s8
+; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
-; GFX9-NEXT:    s_xor_b32 s0, s5, s2
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
+; GFX9-NEXT:    s_xor_b32 s4, s11, s9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s0, v1
-; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
+; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
+; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v3
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s1, s14, 31
-; GFX10-NEXT:    s_ashr_i32 s2, s15, 31
-; GFX10-NEXT:    s_add_i32 s0, s14, s1
-; GFX10-NEXT:    s_add_i32 s3, s15, s2
-; GFX10-NEXT:    s_xor_b32 s4, s0, s1
+; GFX10-NEXT:    s_ashr_i32 s1, s10, 31
+; GFX10-NEXT:    s_ashr_i32 s2, s11, 31
+; GFX10-NEXT:    s_add_i32 s0, s10, s1
+; GFX10-NEXT:    s_add_i32 s3, s11, s2
+; GFX10-NEXT:    s_xor_b32 s10, s0, s1
 ; GFX10-NEXT:    s_xor_b32 s3, s3, s2
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s10
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX10-NEXT:    s_sub_i32 s0, 0, s4
-; GFX10-NEXT:    s_sub_i32 s5, 0, s3
-; GFX10-NEXT:    s_ashr_i32 s6, s13, 31
+; GFX10-NEXT:    s_sub_i32 s0, 0, s10
+; GFX10-NEXT:    s_sub_i32 s11, 0, s3
+; GFX10-NEXT:    s_ashr_i32 s12, s9, 31
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX10-NEXT:    s_add_i32 s7, s13, s6
-; GFX10-NEXT:    s_xor_b32 s7, s7, s6
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s5, v1
-; GFX10-NEXT:    s_ashr_i32 s5, s12, 31
-; GFX10-NEXT:    s_add_i32 s0, s12, s5
-; GFX10-NEXT:    s_xor_b32 s1, s5, s1
-; GFX10-NEXT:    s_xor_b32 s0, s0, s5
+; GFX10-NEXT:    v_mul_lo_u32 v3, s11, v1
+; GFX10-NEXT:    s_ashr_i32 s11, s8, 31
+; GFX10-NEXT:    s_add_i32 s0, s8, s11
+; GFX10-NEXT:    s_add_i32 s8, s9, s12
+; GFX10-NEXT:    s_xor_b32 s0, s0, s11
+; GFX10-NEXT:    s_xor_b32 s8, s8, s12
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX10-NEXT:    s_xor_b32 s1, s11, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s7, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s4
+; GFX10-NEXT:    v_mul_hi_u32 v1, s8, v1
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s10
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s0, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s7, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s4, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s8, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s4, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
@@ -814,26 +814,26 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s4, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s4, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s0, s6, s2
+; GFX10-NEXT:    s_xor_b32 s0, s12, s2
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX10-NEXT:    v_xor_b32_e32 v2, s5, v2
-; GFX10-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX10-NEXT:    v_xor_b32_e32 v2, s11, v2
+; GFX10-NEXT:    v_xor_b32_e32 v3, s12, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s5, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s6, v3
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s11, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s12, v3
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = sdiv <2 x i32> %x, %y
   store <2 x i32> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 62d8b7d6f045c..a58397eccaba7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -576,12 +576,12 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ;
 ; GFX9-LABEL: udivrem_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s14
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GFX9-NEXT:    s_sub_i32 s0, 0, s14
-; GFX9-NEXT:    s_sub_i32 s1, 0, s15
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX9-NEXT:    s_sub_i32 s0, 0, s10
+; GFX9-NEXT:    s_sub_i32 s1, 0, s11
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -593,47 +593,47 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s13, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s14
+; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s10
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s15
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s11
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s12, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s13, v3
+; GFX9-NEXT:    v_sub_u32_e32 v2, s8, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s14, v2
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s10, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s15, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s14, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s10, v2
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s15, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s14
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GFX10-NEXT:    s_sub_i32 s0, 0, s14
-; GFX10-NEXT:    s_sub_i32 s1, 0, s15
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX10-NEXT:    s_sub_i32 s0, 0, s10
+; GFX10-NEXT:    s_sub_i32 s1, 0, s11
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -646,34 +646,34 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v0, s12, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s13, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s14
-; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s15
+; GFX10-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s10
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s11
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s12, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s13, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s14, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s15, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s8, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s9, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s11, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s14, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s15, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s11, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[8:9]
-; GFX10-NEXT:    global_store_dwordx2 v8, v[2:3], s[10:11]
+; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v8, v[2:3], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv <2 x i32> %x, %y
   store <2 x i32> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 611a7b566070c..6f67ce4de9ce5 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -243,21 +243,21 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ;
 ; GFX9-LABEL: s_test_add_v2i16_kernarg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_pk_add_u16 v1, s6, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_pk_add_u16 v1, s2, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_add_v2i16_kernarg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_add_u16 v1, s6, s7
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_pk_add_u16 v1, s2, s3
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_add_v2i16_kernarg:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 559871d162e13..8144fb7a3b646 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -72,31 +72,31 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_sub_i32 s0, 0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
-; GFX9-NEXT:    s_add_i32 s1, s1, s0
-; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
-; GFX9-NEXT:    s_mul_i32 s1, s0, s7
-; GFX9-NEXT:    s_sub_i32 s1, s6, s1
-; GFX9-NEXT:    s_add_i32 s2, s0, 1
-; GFX9-NEXT:    s_sub_i32 s3, s1, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX9-NEXT:    s_add_i32 s2, s0, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX9-NEXT:    s_mul_i32 s5, s4, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-NEXT:    s_add_i32 s6, s4, 1
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_add_i32 s5, s4, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -167,29 +167,29 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: urem_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_sub_i32 s0, 0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
-; GFX9-NEXT:    s_add_i32 s1, s1, s0
-; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
-; GFX9-NEXT:    s_mul_i32 s0, s0, s7
-; GFX9-NEXT:    s_sub_i32 s0, s6, s0
-; GFX9-NEXT:    s_sub_i32 s1, s0, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX9-NEXT:    s_sub_i32 s1, s0, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX9-NEXT:    s_mul_i32 s4, s4, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    s_sub_i32 s4, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX9-NEXT:    s_sub_i32 s4, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -280,37 +280,37 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: sdiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_abs_i32 s0, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX9-NEXT:    s_xor_b32 s1, s6, s7
-; GFX9-NEXT:    s_abs_i32 s2, s6
-; GFX9-NEXT:    s_sub_i32 s3, 0, s0
+; GFX9-NEXT:    s_abs_i32 s4, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_sub_i32 s5, 0, s4
+; GFX9-NEXT:    s_xor_b32 s3, s2, s3
+; GFX9-NEXT:    s_abs_i32 s2, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_ashr_i32 s1, s1, 31
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s3, s3, s6
-; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
-; GFX9-NEXT:    s_add_i32 s6, s6, s3
-; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
-; GFX9-NEXT:    s_mul_i32 s6, s3, s0
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT:    s_mul_i32 s6, s5, s4
 ; GFX9-NEXT:    s_sub_i32 s2, s2, s6
-; GFX9-NEXT:    s_add_i32 s7, s3, 1
-; GFX9-NEXT:    s_sub_i32 s6, s2, s0
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
-; GFX9-NEXT:    s_cselect_b32 s3, s7, s3
+; GFX9-NEXT:    s_add_i32 s7, s5, 1
+; GFX9-NEXT:    s_sub_i32 s6, s2, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
+; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
 ; GFX9-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX9-NEXT:    s_add_i32 s6, s3, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
-; GFX9-NEXT:    s_cselect_b32 s0, s6, s3
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    s_sub_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_add_i32 s6, s5, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s6, s5
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -394,34 +394,34 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: srem_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_abs_i32 s0, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX9-NEXT:    s_ashr_i32 s1, s6, 31
-; GFX9-NEXT:    s_abs_i32 s2, s6
-; GFX9-NEXT:    s_sub_i32 s3, 0, s0
+; GFX9-NEXT:    s_abs_i32 s3, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s5, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX9-NEXT:    s_abs_i32 s2, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s3, s3, s6
-; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
-; GFX9-NEXT:    s_add_i32 s6, s6, s3
-; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
-; GFX9-NEXT:    s_mul_i32 s3, s3, s0
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    s_sub_i32 s3, s2, s0
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
-; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
-; GFX9-NEXT:    s_sub_i32 s3, s2, s0
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
-; GFX9-NEXT:    s_cselect_b32 s0, s3, s2
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    s_sub_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT:    s_mul_i32 s5, s5, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -5482,13 +5482,13 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: udiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s0, s7, 12
-; GFX9-NEXT:    s_lshr_b32 s0, s6, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_add_i32 s3, s3, 12
+; GFX9-NEXT:    s_lshr_b32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = udiv i32 %x, %shl.y
@@ -5524,14 +5524,14 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: udiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s0, s6, 12
-; GFX9-NEXT:    s_lshr_b32 s1, s7, 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -5570,18 +5570,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <
 ;
 ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_hi_u32 s1, s7, 0x100101
-; GFX9-NEXT:    s_sub_i32 s2, s7, s1
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX9-NEXT:    s_add_i32 s2, s2, s1
-; GFX9-NEXT:    s_lshr_b32 s0, s6, 12
-; GFX9-NEXT:    s_lshr_b32 s1, s2, 11
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_mul_hi_u32 s4, s3, 0x100101
+; GFX9-NEXT:    s_sub_i32 s3, s3, s4
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX9-NEXT:    s_add_i32 s3, s3, s4
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 11
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -5875,14 +5875,14 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: urem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
-; GFX9-NEXT:    s_add_i32 s0, s0, -1
-; GFX9-NEXT:    s_and_b32 s0, s6, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX9-NEXT:    s_add_i32 s3, s3, -1
+; GFX9-NEXT:    s_and_b32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = urem i32 %x, %shl.y
@@ -5918,14 +5918,14 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: urem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s6, 0xfff
-; GFX9-NEXT:    s_and_b32 s1, s7, 0xfff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
+; GFX9-NEXT:    s_and_b32 s3, s3, 0xfff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6234,41 +6234,41 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
-; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
-; GFX9-NEXT:    s_add_i32 s0, s0, s1
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX9-NEXT:    s_ashr_i32 s2, s6, 31
-; GFX9-NEXT:    s_add_i32 s3, s6, s2
-; GFX9-NEXT:    s_sub_i32 s6, 0, s0
+; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_add_i32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s6, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
+; GFX9-NEXT:    s_add_i32 s2, s2, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s3, s3, s2
+; GFX9-NEXT:    s_xor_b32 s2, s2, s5
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
 ; GFX9-NEXT:    s_mul_i32 s6, s6, s7
 ; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
-; GFX9-NEXT:    s_mul_hi_u32 s6, s3, s7
-; GFX9-NEXT:    s_mul_i32 s8, s6, s0
-; GFX9-NEXT:    s_sub_i32 s3, s3, s8
+; GFX9-NEXT:    s_mul_hi_u32 s6, s2, s7
+; GFX9-NEXT:    s_mul_i32 s8, s6, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s8
 ; GFX9-NEXT:    s_add_i32 s7, s6, 1
-; GFX9-NEXT:    s_sub_i32 s8, s3, s0
-; GFX9-NEXT:    s_cmp_ge_u32 s3, s0
+; GFX9-NEXT:    s_sub_i32 s8, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
 ; GFX9-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX9-NEXT:    s_cselect_b32 s3, s8, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s8, s2
 ; GFX9-NEXT:    s_add_i32 s7, s6, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s3, s0
-; GFX9-NEXT:    s_cselect_b32 s0, s7, s6
-; GFX9-NEXT:    s_xor_b32 s1, s2, s1
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    s_sub_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s7, s6
+; GFX9-NEXT:    s_xor_b32 s3, s5, s4
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = sdiv i32 %x, %shl.y
@@ -6310,20 +6310,20 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
-; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
-; GFX9-NEXT:    s_add_i32 s0, s6, s0
-; GFX9-NEXT:    s_add_i32 s1, s7, s1
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
-; GFX9-NEXT:    s_ashr_i32 s1, s1, 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
+; GFX9-NEXT:    s_add_i32 s2, s2, s4
+; GFX9-NEXT:    s_add_i32 s3, s3, s5
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6365,21 +6365,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
-; GFX9-NEXT:    s_mul_hi_i32 s1, s7, 0x80080081
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX9-NEXT:    s_add_i32 s1, s1, s7
-; GFX9-NEXT:    s_add_i32 s0, s6, s0
-; GFX9-NEXT:    s_lshr_b32 s2, s1, 31
-; GFX9-NEXT:    s_ashr_i32 s1, s1, 11
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
-; GFX9-NEXT:    s_add_i32 s1, s1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX9-NEXT:    s_mul_hi_i32 s5, s3, 0x80080081
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX9-NEXT:    s_add_i32 s5, s5, s3
+; GFX9-NEXT:    s_add_i32 s2, s2, s4
+; GFX9-NEXT:    s_lshr_b32 s3, s5, 31
+; GFX9-NEXT:    s_ashr_i32 s4, s5, 11
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
+; GFX9-NEXT:    s_add_i32 s4, s4, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6746,38 +6746,38 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: srem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
-; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
-; GFX9-NEXT:    s_add_i32 s0, s0, s1
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX9-NEXT:    s_ashr_i32 s1, s6, 31
-; GFX9-NEXT:    s_add_i32 s2, s6, s1
-; GFX9-NEXT:    s_sub_i32 s3, 0, s0
+; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_add_i32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s5, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX9-NEXT:    s_add_i32 s2, s2, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s2, s2, s1
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s3, s3, s6
-; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
-; GFX9-NEXT:    s_add_i32 s6, s6, s3
-; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
-; GFX9-NEXT:    s_mul_i32 s3, s3, s0
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    s_sub_i32 s3, s2, s0
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
-; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
-; GFX9-NEXT:    s_sub_i32 s3, s2, s0
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
-; GFX9-NEXT:    s_cselect_b32 s0, s3, s2
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    s_sub_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT:    s_mul_i32 s5, s5, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = srem i32 %x, %shl.y
@@ -6821,22 +6821,22 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: srem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
-; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
-; GFX9-NEXT:    s_add_i32 s0, s6, s0
-; GFX9-NEXT:    s_add_i32 s1, s7, s1
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xfffff000
-; GFX9-NEXT:    s_sub_i32 s0, s6, s0
-; GFX9-NEXT:    s_sub_i32 s1, s7, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
+; GFX9-NEXT:    s_add_i32 s4, s2, s4
+; GFX9-NEXT:    s_add_i32 s5, s3, s5
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    s_and_b32 s4, s5, 0xfffff000
+; GFX9-NEXT:    s_sub_i32 s3, s3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index e914635d6c26f..b26d15ed3a1c8 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -316,14 +316,14 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
 ;
 ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
 ; GFX940:       ; %bb.0: ; %entry
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    s_lshl_b32 s0, s7, 16
-; GFX940-NEXT:    s_lshl_b32 s1, s6, 16
-; GFX940-NEXT:    v_mov_b32_e32 v0, s1
-; GFX940-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1
+; GFX940-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX940-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NEXT:    s_endpgm
 entry:
   %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index d511bb1f4a257..b6948dab6bf9f 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -49,20 +49,20 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
 ;
 ; GFX10-LABEL: cluster_load_cluster_store:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s0, s4, 8
-; GFX10-NEXT:    s_addc_u32 s1, s5, 0
-; GFX10-NEXT:    s_add_u32 s2, s4, 16
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    s_addc_u32 s3, s5, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s4, 24
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    s_addc_u32 s1, s5, 0
-; GFX10-NEXT:    v_mov_b32_e32 v5, s3
-; GFX10-NEXT:    v_mov_b32_e32 v4, s2
+; GFX10-NEXT:    s_add_u32 s4, s0, 8
+; GFX10-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-NEXT:    s_add_u32 s6, s0, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_addc_u32 s7, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    s_add_u32 s0, s0, 24
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s5
+; GFX10-NEXT:    v_mov_b32_e32 v4, s6
+; GFX10-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX10-NEXT:    s_clause 0x3
@@ -70,16 +70,16 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
 ; GFX10-NEXT:    flat_load_dword v9, v[2:3]
 ; GFX10-NEXT:    flat_load_dword v10, v[4:5]
 ; GFX10-NEXT:    flat_load_dword v11, v[6:7]
-; GFX10-NEXT:    s_add_u32 s0, s6, 8
-; GFX10-NEXT:    s_addc_u32 s1, s7, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    s_add_u32 s0, s2, 8
+; GFX10-NEXT:    s_addc_u32 s1, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s6, 16
-; GFX10-NEXT:    s_addc_u32 s1, s7, 0
-; GFX10-NEXT:    s_add_u32 s2, s6, 24
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-NEXT:    s_addc_u32 s3, s7, 0
+; GFX10-NEXT:    s_add_u32 s0, s2, 16
+; GFX10-NEXT:    s_addc_u32 s1, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    s_add_u32 s2, s2, 24
+; GFX10-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s3
@@ -175,20 +175,20 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
 ;
 ; GFX10-LABEL: cluster_load_valu_cluster_store:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s0, s4, 8
-; GFX10-NEXT:    s_addc_u32 s1, s5, 0
-; GFX10-NEXT:    s_add_u32 s2, s4, 16
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_addc_u32 s3, s5, 0
-; GFX10-NEXT:    s_add_u32 s0, s4, 24
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    s_addc_u32 s1, s5, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    v_mov_b32_e32 v5, s3
-; GFX10-NEXT:    v_mov_b32_e32 v4, s2
+; GFX10-NEXT:    s_add_u32 s4, s0, 8
+; GFX10-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    s_add_u32 s6, s0, 16
+; GFX10-NEXT:    v_mov_b32_e32 v3, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_addc_u32 s7, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    s_add_u32 s0, s0, 24
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s6
+; GFX10-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX10-NEXT:    flat_load_dword v6, v[2:3]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
@@ -196,18 +196,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
 ; GFX10-NEXT:    flat_load_dword v8, v[0:1]
 ; GFX10-NEXT:    flat_load_dword v9, v[4:5]
 ; GFX10-NEXT:    flat_load_dword v10, v[2:3]
-; GFX10-NEXT:    s_add_u32 s0, s6, 8
-; GFX10-NEXT:    s_addc_u32 s1, s7, 0
-; GFX10-NEXT:    s_add_u32 s2, s6, 16
+; GFX10-NEXT:    s_add_u32 s0, s2, 8
+; GFX10-NEXT:    s_addc_u32 s1, s3, 0
+; GFX10-NEXT:    s_add_u32 s4, s2, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    s_addc_u32 s3, s7, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    s_addc_u32 s5, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s6, 24
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-NEXT:    v_mov_b32_e32 v5, s3
-; GFX10-NEXT:    s_addc_u32 s1, s7, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, s2
+; GFX10-NEXT:    s_add_u32 s0, s2, 24
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    s_addc_u32 s1, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s1
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index 9c7fa1537c0c2..33c0d90f94a39 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -665,17 +665,17 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar
 ;
 ; GFX9-LABEL: sub_zext_setcc_commute:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v2, s[4:5]
+; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, s6, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -714,17 +714,17 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar
 ;
 ; GFX9-LABEL: sub_sext_setcc_commute:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v2, s[4:5]
+; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, s6, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 3145ee1f6141e..4decf39d04013 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -164,28 +164,28 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_ctlz_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32:
@@ -277,32 +277,32 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_ctlz_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_v2i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_v2i32:
@@ -411,11 +411,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_ctlz_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v3, v3
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
@@ -425,16 +425,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_v4i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
@@ -444,7 +444,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_v4i32:
@@ -550,28 +550,28 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; GFX10-LABEL: v_ctlz_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i8:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i8:
@@ -737,24 +737,24 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; GFX10-LABEL: s_ctlz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_flbit_i32_b64 s0, s[6:7]
-; GFX10-NEXT:    s_min_u32 s0, s0, 64
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_flbit_i32_b64 s2, s[2:3]
+; GFX10-NEXT:    s_min_u32 s2, s2, 64
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
-; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
+; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 64
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_ctlz_i64_trunc:
@@ -847,25 +847,25 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_ctlz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
@@ -873,7 +873,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i64:
@@ -974,33 +974,33 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; GFX10-LABEL: v_ctlz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i64_trunc:
@@ -1090,29 +1090,29 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1:
@@ -1197,29 +1197,29 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1:
@@ -1313,32 +1313,32 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth:
@@ -1435,32 +1435,32 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth:
@@ -1552,22 +1552,22 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1578,7 +1578,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1:
@@ -1674,25 +1674,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
@@ -1700,7 +1700,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1:
@@ -1795,23 +1795,23 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1824,7 +1824,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1:
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index a377714ebf737..756b819099682 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -134,14 +134,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -211,15 +211,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -295,17 +295,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
-; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -562,14 +562,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s1, 0
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s4, s[2:3]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
   %ctlz_ret = icmp ne i64 %val, 0
@@ -650,17 +650,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
@@ -754,11 +754,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
@@ -766,7 +766,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
@@ -870,13 +870,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -887,7 +887,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@@ -1051,17 +1051,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:2
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:4
-; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:5
-; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:6
-; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:7
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
+; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:6
+; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:7
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
@@ -1082,7 +1082,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v4, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
@@ -1159,11 +1159,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1171,7 +1171,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1283,12 +1283,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
   %trunc = trunc i64 %ctlz to i32
@@ -1365,17 +1365,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 32, v0
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -1455,17 +1455,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -1535,16 +1535,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1614,16 +1614,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1698,11 +1698,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1710,9 +1710,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa s[0:1], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, s[0:1]
-; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa s[2:3], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, s[2:3]
+; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1800,17 +1800,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -1889,16 +1889,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1973,16 +1973,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -2058,16 +2058,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -2143,16 +2143,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index ec532c8e4adc3..ee2894a66fbfc 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -148,28 +148,28 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_cttz_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -246,32 +246,32 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_cttz_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_v2i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -362,11 +362,11 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_cttz_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v3, v3
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
@@ -376,16 +376,16 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_v4i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
@@ -395,7 +395,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -475,26 +475,26 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; GFX10-LABEL: v_cttz_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v1, 0x100, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i8:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %valptr
   %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
@@ -629,24 +629,24 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; GFX10-LABEL: s_cttz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ff1_i32_b64 s0, s[6:7]
-; GFX10-NEXT:    s_min_u32 s0, s0, 64
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_ff1_i32_b64 s2, s[2:3]
+; GFX10-NEXT:    s_min_u32 s2, s2, 64
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[6:7]
-; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s2, s[2:3]
+; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 64
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
   %trunc = trunc i64 %cttz to i32
@@ -726,25 +726,25 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_cttz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
@@ -752,7 +752,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -835,33 +835,33 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; GFX10-LABEL: v_cttz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -933,29 +933,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1027,29 +1027,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1130,32 +1130,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1235,32 +1235,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1335,32 +1335,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s0, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s0
-; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[4:5]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s2
+; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1442,31 +1442,31 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %valptr
   %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
@@ -1542,23 +1542,23 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1570,7 +1570,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 086d99916ba04..392a44318b0a5 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -121,14 +121,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -198,15 +198,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -282,17 +282,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -538,14 +538,14 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s1, 0
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s0, s[6:7]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s4, s[2:3]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
   %cttz_ret = icmp ne i64 %val, 0
@@ -622,16 +622,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
@@ -721,18 +721,18 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
@@ -836,13 +836,13 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -853,7 +853,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
@@ -1017,17 +1017,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:2
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:4
-; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:5
-; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:6
-; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:7
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
+; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:6
+; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:7
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
@@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v0, v4
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
@@ -1152,13 +1152,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1274,13 +1274,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1292,7 +1292,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1404,13 +1404,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1422,7 +1422,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1498,18 +1498,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, 0x100, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
@@ -1597,12 +1597,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
@@ -1610,7 +1610,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 4226728dbe118..097604d57803e 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -933,24 +933,24 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
 ;
 ; GFX10-LABEL: load_i8_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_i8_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_i8_to_f32:
@@ -1013,28 +1013,28 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v2i8_to_v2f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v2i8_to_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v2i8_to_v2f32:
@@ -1102,30 +1102,30 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v3i8_to_v3f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX10-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v3i8_to_v3f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v3i8_to_v3f32:
@@ -1194,32 +1194,32 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32:
@@ -1312,15 +1312,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
@@ -1329,19 +1329,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
@@ -1350,7 +1350,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned:
@@ -1643,12 +1643,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff00, v0
@@ -1666,21 +1666,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX10-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
-; GFX10-NEXT:    global_store_dword v4, v5, s[6:7]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dword v4, v5, s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0xff00
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v0, s[0:1]
-; GFX9-NEXT:    s_movk_i32 s0, 0xff00
-; GFX9-NEXT:    s_movk_i32 s1, 0x900
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_movk_i32 s5, 0x900
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
@@ -1688,16 +1689,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff00, v4
 ; GFX9-NEXT:    v_add_u16_e32 v8, 9, v4
-; GFX9-NEXT:    v_and_b32_sdwa v9, v4, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[4:5]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u16_e32 v0, 0x900, v0
-; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT:    global_store_dword v5, v0, s[6:7]
+; GFX9-NEXT:    global_store_dword v5, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses:
@@ -1837,17 +1839,17 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v7i8_to_v7f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x5
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:6
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7] offset:1
-; GFX10-NEXT:    global_load_short_d16 v7, v0, s[6:7] offset:4
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3] offset:1
+; GFX10-NEXT:    global_load_short_d16 v7, v0, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
@@ -1861,22 +1863,22 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[4:5] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v7i8_to_v7f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:6
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:4
-; GFX9-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v7, v0, s[6:7] offset:2
-; GFX9-NEXT:    global_load_ubyte v8, v0, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v9, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
+; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v7, v0, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v8, v0, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v9, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
@@ -1890,8 +1892,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v9
-; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
-; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[4:5] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v7i8_to_v7f32:
@@ -1988,11 +1990,11 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v8i8_to_v8f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[8:9], v0, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v7, v9
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v6, v9
@@ -2002,17 +2004,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v8
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v8
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v8
-; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[4:5] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v8i8_to_v8f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[7:8], v0, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[7:8], v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
@@ -2022,8 +2024,8 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
-; GFX9-NEXT:    global_store_dwordx4 v9, v[4:7], s[4:5] offset:16
-; GFX9-NEXT:    global_store_dwordx4 v9, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v9, v[4:7], s[0:1] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v9, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v8i8_to_v8f32:
@@ -2096,28 +2098,28 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX10-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_inreg_i32_to_f32:
@@ -2182,26 +2184,26 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32:
@@ -2264,24 +2266,24 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: i8_zext_i32_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_i32_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_i32_to_f32:
@@ -2367,15 +2369,15 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
@@ -2384,19 +2386,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
@@ -2405,7 +2407,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32:
@@ -2477,26 +2479,26 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte0_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte0_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte0_to_f32:
@@ -2558,26 +2560,26 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte1_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte1_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte1_to_f32:
@@ -2640,26 +2642,26 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte2_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte2_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte2_to_f32:
@@ -2722,26 +2724,26 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte3_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte3_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte3_to_f32:
@@ -2823,16 +2825,16 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: cvt_ubyte0_or_multiuse:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[4:5]
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: cvt_ubyte0_or_multiuse:
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 739fff5084135..8f31bb1fe0a81 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -376,22 +376,22 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
 ;
 ; GFX9-LABEL: uniform_vec_i16_LH:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_lh_b32_b16 s0, s6, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX906-LABEL: uniform_vec_i16_LH:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_lh_b32_b16 s0, s6, s7
-; GFX906-NEXT:    v_mov_b32_e32 v1, s0
-; GFX906-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX906-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
+; GFX906-NEXT:    v_mov_b32_e32 v1, s2
+; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX906-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: uniform_vec_i16_LH:
@@ -466,22 +466,22 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: uniform_vec_i16_HH:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_hh_b32_b16 s0, s6, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX906-LABEL: uniform_vec_i16_HH:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_hh_b32_b16 s0, s6, s7
-; GFX906-NEXT:    v_mov_b32_e32 v1, s0
-; GFX906-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX906-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
+; GFX906-NEXT:    v_mov_b32_e32 v1, s2
+; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX906-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: uniform_vec_i16_HH:
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 4cef7015811fe..777a8f3fef1c1 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -319,16 +319,16 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2
 ;
 ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-NEXT:    ds_read_b32 v1, v1 offset:32
 ; GFX9-NEXT:    ds_read_b32 v2, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
@@ -370,16 +370,16 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-NEXT:    ds_read_b32 v1, v1 offset:32
 ; GFX9-NEXT:    ds_read_b32 v2, v2 offset:32
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index ae280c5a443e1..b8936911f0576 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1831,21 +1831,21 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
 ;
 ; GFX9-LABEL: s_copysign_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    s_lshr_b32 s1, s7, 16
-; GFX9-NEXT:    s_lshr_b32 s2, s6, 16
-; GFX9-NEXT:    v_bfi_b32 v1, s0, v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_bfi_b32 v2, s0, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_copysign_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 6c5b2917855fc..0468175c5df50 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -118,10 +118,10 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX10-LABEL: s_fdiv_f32_ninf:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v3, -v0, v1, 1.0
@@ -133,8 +133,8 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_ninf:
@@ -275,21 +275,21 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX10-LABEL: s_fdiv_f32_ieee:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX10-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX10-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX10-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_ieee:
@@ -370,16 +370,16 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
 ;
 ; GFX10-LABEL: s_fdiv_25ulp_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |s7|
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0
-; GFX10-NEXT:    v_mul_f32_e32 v1, s7, v0
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |s3|
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4
+; GFX10-NEXT:    v_mul_f32_e32 v1, s3, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v1, s6, v1
+; GFX10-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_25ulp_f32:
@@ -482,18 +482,18 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a
 ;
 ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, s7
-; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, s7
-; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, s6
-; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v3, s6
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, s3
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, s3
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, s2
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v3, s2
 ; GFX10-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32:
@@ -559,12 +559,12 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_fast_ieee_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
-; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_fast_ieee_f32:
@@ -623,12 +623,12 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_f32_fast_math:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
-; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_fast_math:
@@ -687,12 +687,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo
 ;
 ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
-; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math:
@@ -829,10 +829,10 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_f32_arcp_daz:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v3, -v0, v1, 1.0
@@ -844,8 +844,8 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_arcp_daz:
@@ -916,12 +916,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_f32_arcp_ninf:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
-; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_arcp_ninf:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index 26714dcc6dfac..e44572985e6d2 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v2, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
-; GCN3-NEXT:    v_mov_b32_e32 v3, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v2, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -4570,12 +4570,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v2, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
-; GCN3-NEXT:    v_mov_b32_e32 v3, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v2, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 66aacd7062a6d..5bd527149572e 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -3883,13 +3883,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
-; GCN3-NEXT:    s_mov_b32 s0, s7
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN3-NEXT:    s_mov_b32 s4, s3
+; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s4
+; GCN3-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -3897,7 +3897,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN3-NEXT:  .LBB88_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
+; GCN3-NEXT:    v_max_i32_e32 v2, s2, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -4085,13 +4085,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
-; GCN3-NEXT:    s_mov_b32 s0, s7
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN3-NEXT:    s_mov_b32 s4, s3
+; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s4
+; GCN3-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
@@ -4099,7 +4099,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ; GCN3-NEXT:  .LBB90_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
+; GCN3-NEXT:    v_max_i32_e32 v2, s2, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -5026,13 +5026,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ;
 ; GCN3-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
-; GCN3-NEXT:    s_mov_b32 s0, s7
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN3-NEXT:    s_mov_b32 s4, s3
+; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s4
+; GCN3-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -5040,7 +5040,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ; GCN3-NEXT:  .LBB102_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_u32_e32 v2, s6, v3
+; GCN3-NEXT:    v_max_u32_e32 v2, s2, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -6820,13 +6820,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN3-LABEL: atomic_min_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
-; GCN3-NEXT:    s_mov_b32 s0, s7
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN3-NEXT:    s_mov_b32 s4, s3
+; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s4
+; GCN3-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -6834,7 +6834,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN3-NEXT:  .LBB125_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_min_i32_e32 v2, s6, v3
+; GCN3-NEXT:    v_min_i32_e32 v2, s2, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 9943976dd86da..718be90eb75fc 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -1389,49 +1389,49 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_f16:
@@ -1558,49 +1558,49 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_inv_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_inv_f16:
@@ -1727,49 +1727,49 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16:
@@ -1897,49 +1897,49 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16:
@@ -2067,49 +2067,49 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; GFX10-FLUSH-LABEL: neg_neg_mad_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: neg_neg_mad_f16:
@@ -2238,49 +2238,49 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index f55e9f4821b47..5761c198e20ba 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -316,10 +316,10 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
 define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -359,10 +359,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
 define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    ds_pk_add_bf16 v0, v1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 3c4087fe391b6..4ea3323a9dbfc 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -137,12 +137,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: fshl_i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_alignbit_b32 v1, s6, v1, 25
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 25
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_i32_imm:
@@ -159,11 +159,11 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: fshl_i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s6, s7, 25
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 25
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshl_i32_imm:
@@ -734,15 +734,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ;
 ; GFX9-LABEL: orxor2or1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, s6, 7
-; GFX9-NEXT:    s_or_b32 s0, s7, s0
-; GFX9-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX9-NEXT:    s_cselect_b32 s0, s6, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_lshl_b32 s4, s2, 7
+; GFX9-NEXT:    s_or_b32 s4, s3, s4
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: orxor2or1:
@@ -761,15 +761,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ;
 ; GFX10-LABEL: orxor2or1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s0, s6, 7
-; GFX10-NEXT:    s_or_b32 s0, s7, s0
-; GFX10-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX10-NEXT:    s_cselect_b32 s0, s6, s7
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_lshl_b32 s4, s2, 7
+; GFX10-NEXT:    s_or_b32 s4, s3, s4
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: orxor2or1:
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index e8377763e4be2..e8310e73f9a47 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -129,12 +129,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: fshr_i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_alignbit_b32 v1, s6, v1, 7
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 7
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_i32_imm:
@@ -151,11 +151,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: fshr_i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s6, s7, 7
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 7
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshr_i32_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index d4398e5367c7f..dac3a3db7b450 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -4155,12 +4155,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4406,12 +4406,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 1fa7c52a68802..516c92f1640ea 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -4679,28 +4679,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX9-NEXT:    s_mov_b32 s0, s7
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX9-NEXT:    s_mov_b32 s4, s3
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_i32_e32 v0, s6, v1
+; GFX9-NEXT:    v_max_i32_e32 v0, s2, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB91_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -4890,28 +4890,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX9-NEXT:    s_mov_b32 s0, s7
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX9-NEXT:    s_mov_b32 s4, s3
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:  .LBB93_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_i32_e32 v0, s6, v1
+; GFX9-NEXT:    v_max_i32_e32 v0, s2, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB93_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -5932,28 +5932,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_umax_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX9-NEXT:    s_mov_b32 s0, s7
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX9-NEXT:    s_mov_b32 s4, s3
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:  .LBB105_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_u32_e32 v0, s6, v1
+; GFX9-NEXT:    v_max_u32_e32 v0, s2, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB105_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -7923,28 +7923,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_min_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX9-NEXT:    s_mov_b32 s0, s7
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX9-NEXT:    s_mov_b32 s4, s3
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_min_i32_e32 v0, s6, v1
+; GFX9-NEXT:    v_min_i32_e32 v0, s2, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB128_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index d5265e364a17e..df03e89370377 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -436,121 +436,121 @@ entry:
 define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_sub_i32 s0, 0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
-; GFX9-NEXT:    s_add_i32 s1, s1, s0
-; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
-; GFX9-NEXT:    s_mul_i32 s1, s0, s7
-; GFX9-NEXT:    s_sub_i32 s1, s6, s1
-; GFX9-NEXT:    s_add_i32 s2, s0, 1
-; GFX9-NEXT:    s_sub_i32 s3, s1, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX9-NEXT:    s_add_i32 s2, s0, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX9-NEXT:    s_mul_i32 s5, s4, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-NEXT:    s_add_i32 s6, s4, 1
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_add_i32 s5, s4, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: udiv_i32:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX90A-NEXT:    s_sub_i32 s0, 0, s7
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX90A-NEXT:    s_mul_i32 s0, s0, s1
-; GFX90A-NEXT:    s_mul_hi_u32 s0, s1, s0
-; GFX90A-NEXT:    s_add_i32 s1, s1, s0
-; GFX90A-NEXT:    s_mul_hi_u32 s0, s6, s1
-; GFX90A-NEXT:    s_mul_i32 s1, s0, s7
-; GFX90A-NEXT:    s_sub_i32 s1, s6, s1
-; GFX90A-NEXT:    s_add_i32 s2, s0, 1
-; GFX90A-NEXT:    s_sub_i32 s3, s1, s7
-; GFX90A-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX90A-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX90A-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX90A-NEXT:    s_add_i32 s2, s0, 1
-; GFX90A-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX90A-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX90A-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX90A-NEXT:    s_mul_i32 s4, s4, s5
+; GFX90A-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX90A-NEXT:    s_add_i32 s5, s5, s4
+; GFX90A-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX90A-NEXT:    s_mul_i32 s5, s4, s3
+; GFX90A-NEXT:    s_sub_i32 s2, s2, s5
+; GFX90A-NEXT:    s_add_i32 s6, s4, 1
+; GFX90A-NEXT:    s_sub_i32 s5, s2, s3
+; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX90A-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX90A-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX90A-NEXT:    s_add_i32 s5, s4, 1
+; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX90A-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udiv_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX10-NEXT:    s_sub_i32 s1, 0, s7
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX10-NEXT:    s_sub_i32 s5, 0, s3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_mul_i32 s1, s1, s0
-; GFX10-NEXT:    s_mul_hi_u32 s1, s0, s1
-; GFX10-NEXT:    s_add_i32 s0, s0, s1
-; GFX10-NEXT:    s_mul_hi_u32 s0, s6, s0
-; GFX10-NEXT:    s_mul_i32 s1, s0, s7
-; GFX10-NEXT:    s_add_i32 s2, s0, 1
-; GFX10-NEXT:    s_sub_i32 s1, s6, s1
-; GFX10-NEXT:    s_sub_i32 s3, s1, s7
-; GFX10-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX10-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX10-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX10-NEXT:    s_add_i32 s2, s0, 1
-; GFX10-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX10-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_mul_i32 s5, s5, s4
+; GFX10-NEXT:    s_mul_hi_u32 s5, s4, s5
+; GFX10-NEXT:    s_add_i32 s4, s4, s5
+; GFX10-NEXT:    s_mul_hi_u32 s4, s2, s4
+; GFX10-NEXT:    s_mul_i32 s5, s4, s3
+; GFX10-NEXT:    s_sub_i32 s2, s2, s5
+; GFX10-NEXT:    s_add_i32 s5, s4, 1
+; GFX10-NEXT:    s_sub_i32 s6, s2, s3
+; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX10-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX10-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX10-NEXT:    s_add_i32 s5, s4, 1
+; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX10-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-FLATSCR-LABEL: udiv_i32:
 ; GFX9-FLATSCR:       ; %bb.0:
-; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-FLATSCR-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s0, 0, s7
+; GFX9-FLATSCR-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX9-FLATSCR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-FLATSCR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-FLATSCR-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-FLATSCR-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s0, s1, s0
-; GFX9-FLATSCR-NEXT:    s_add_i32 s1, s1, s0
-; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s0, s6, s1
-; GFX9-FLATSCR-NEXT:    s_mul_i32 s1, s0, s7
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s1, s6, s1
-; GFX9-FLATSCR-NEXT:    s_add_i32 s2, s0, 1
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s3, s1, s7
-; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX9-FLATSCR-NEXT:    s_add_i32 s2, s0, 1
-; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-FLATSCR-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s5, s4, s3
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-FLATSCR-NEXT:    s_add_i32 s6, s4, 1
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s4, 1
+; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 45a1afbf11992..920ff8a927e2d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -32,21 +32,21 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x
 ;
 ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s6, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s2, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, s6, s7
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, s2, s3
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index 5d20a848bd6a6..17b941c59fd3f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -27,12 +27,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
@@ -50,12 +50,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
@@ -78,12 +78,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
@@ -101,12 +101,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %src_input = call float @llvm.fabs.f32(float %src)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index 674fec1b865a6..ce055d6527996 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -30,14 +30,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |v0|
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |v0|
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs:
@@ -88,14 +88,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], |s6|, |v0|
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[2:3], |s2|, |v0|
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index 8fe85e49a4207..309fd99031155 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -1759,16 +1759,16 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
 ;
 ; GFX10-LABEL: v_icmp_i1_ne0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_gt_u32 s6, 1
-; GFX10-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX10-NEXT:    s_cmp_gt_u32 s7, 2
-; GFX10-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX10-NEXT:    s_and_b32 s0, s0, s1
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_cmp_gt_u32 s2, 1
+; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX10-NEXT:    s_cmp_gt_u32 s3, 2
+; GFX10-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %c0 = icmp ugt i32 %a, 1
   %c1 = icmp ugt i32 %b, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index a650f999835c6..5f979e0177f58 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -1986,17 +1986,17 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
 ;
 ; GFX9-LABEL: v_icmp_i1_ne0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_gt_u32 s6, 1
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    s_cmp_gt_u32 s7, 2
+; GFX9-NEXT:    s_cmp_gt_u32 s2, 1
+; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT:    s_cmp_gt_u32 s3, 2
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %c0 = icmp ugt i32 %a, 1
   %c1 = icmp ugt i32 %b, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index b0706025f0b68..dba67a03c000e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -14,12 +14,12 @@ entry:
 define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_iglp_opt_mfma_gemm:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GCN-NEXT:    ; iglp_opt mask(0x00000000)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v1, s2, v0
+; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
 ; GCN-NEXT:    v_add_u32_e32 v2, 0x6000, v1
 ; GCN-NEXT:    ds_read_b128 a[28:31], v2 offset:57456
 ; GCN-NEXT:    ds_read_b128 a[24:27], v2 offset:57440
@@ -44,7 +44,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in,
 ; GCN-NEXT:    ds_read_b128 a[152:155], v1 offset:96
 ; GCN-NEXT:    ds_read_b128 a[68:71], v1 offset:24592
 ; GCN-NEXT:    ds_read_b128 a[64:67], v1 offset:24576
-; GCN-NEXT:    v_add_u32_e32 v0, s3, v0
+; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(4)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
 ; GCN-NEXT:    ds_read_b128 a[148:151], v1 offset:80
@@ -80,7 +80,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in,
 ; GCN-NEXT:    ds_write_b128 v0, a[136:139] offset:32
 ; GCN-NEXT:    ds_write_b128 v0, a[132:135] offset:16
 ; GCN-NEXT:    ds_write_b128 v0, a[128:131]
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(8)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
 ; GCN-NEXT:    ds_write_b128 v0, a[56:59] offset:24672
@@ -151,13 +151,13 @@ entry:
 define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GCN-NEXT:    ; iglp_opt mask(0x00000001)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v1, s2, v0
+; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
 ; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:112
 ; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:96
 ; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:80
@@ -176,7 +176,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
 ; GCN-NEXT:    ds_read_b128 a[136:139], v1 offset:8224
 ; GCN-NEXT:    ds_read_b128 a[132:135], v1 offset:8208
 ; GCN-NEXT:    ds_read_b128 a[128:131], v1 offset:8192
-; GCN-NEXT:    v_add_u32_e32 v0, s3, v0
+; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
 ; GCN-NEXT:    ds_read_b128 a[124:127], v1 offset:24688
@@ -218,7 +218,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
 ; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32
 ; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16
 ; GCN-NEXT:    ds_write_b128 v0, a[0:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    ds_write_b128 v0, a[152:155] offset:8288
 ; GCN-NEXT:    ds_write_b128 v0, a[156:159] offset:8304
 ; GCN-NEXT:    ds_write_b128 v0, a[144:147] offset:8256
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index cd92529b77165..265d64f47bb23 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -245,26 +245,24 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
 define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-SDAG-NEXT:    s_mov_b32 null, 0
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s7
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT:    s_mov_b32 null, 0
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s7
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs:
@@ -327,17 +325,27 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0
 }
 
 define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-LABEL: v_permlane16_b32_vsv:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_mov_b32 null, 0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-NEXT:    v_permlane16_b32 v0, v0, s7, s0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_permlane16_b32_vsv:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s3, s2
+; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vsv:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv:
 ; GFX11-SDAG:       ; %bb.0:
@@ -773,26 +781,24 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
 define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-SDAG-NEXT:    s_mov_b32 null, 0
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s7
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT:    s_mov_b32 null, 0
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s7
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs:
@@ -855,17 +861,27 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src
 }
 
 define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-LABEL: v_permlanex16_b32_vsv:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_mov_b32 null, 0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s3, s2
+; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv:
 ; GFX11-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index ae5b62ffb285b..10f09b6390aba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -29,18 +29,18 @@ entry:
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; GCN-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(8) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
@@ -82,32 +82,32 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 ; GCN-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; GCN-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
-; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:112
-; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:96
-; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:80
-; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:64
-; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
-; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:32
-; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:16
-; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
+; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
+; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
+; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
+; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
+; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
+; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
+; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(30) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
 ; GCN-NEXT:    s_endpgm
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr4_sgpr5
+; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(7)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
@@ -149,14 +149,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:112
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:96
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:80
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:32
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(30) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
@@ -178,17 +178,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; GCN-NEXT:    v_mul_lo_u32 v31, v31, v31
@@ -198,12 +198,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -218,11 +218,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
 ; GCN-NEXT:    v_mul_lo_u32 v14, v14, v14
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:64
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -242,14 +242,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; GCN-NEXT:    v_mul_lo_u32 v20, v20, v20
 ; GCN-NEXT:    v_mul_lo_u32 v16, v16, v16
-; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:112
-; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:96
-; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:80
-; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:64
-; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
-; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:32
-; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:16
-; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
+; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
+; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
+; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:80
+; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:64
+; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
+; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:32
+; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:16
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -258,17 +258,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:96
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v31, v31, v31
@@ -278,12 +278,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -298,11 +298,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:80
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v14, v14, v14
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:64
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -322,14 +322,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v20, v20, v20
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v16, v16, v16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:112
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:96
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:80
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:32
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:32
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -381,18 +381,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; GCN-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v7, v7, v7
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
 ; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
@@ -403,25 +403,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:112
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:112
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:96
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:96
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:48
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:64
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -430,9 +430,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:64
-; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:16
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -452,15 +452,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:80
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:80
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -469,18 +469,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr4_sgpr5
+; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v7, v7, v7
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
@@ -491,25 +491,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:112
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:48
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -518,9 +518,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -540,15 +540,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -614,10 +614,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v1, s2, v0
+; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
 ; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:112
 ; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:96
 ; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:80
@@ -661,7 +661,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; GCN-NEXT:    ds_read_b128 a[136:139], v2 offset:57376
 ; GCN-NEXT:    ds_read_b128 a[140:143], v2 offset:57392
 ; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
-; GCN-NEXT:    v_add_u32_e32 v0, s3, v0
+; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(14)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
@@ -681,7 +681,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32
 ; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16
 ; GCN-NEXT:    ds_write_b128 v0, a[0:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    ds_write_b128 v0, a[56:59] offset:8288
 ; GCN-NEXT:    ds_write_b128 v0, a[60:63] offset:8304
 ; GCN-NEXT:    ds_write_b128 v0, a[48:51] offset:8256
@@ -720,10 +720,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s2, v0
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s0, v0
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:112
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:96
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:80
@@ -767,7 +767,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[136:139], v2 offset:57376
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[140:143], v2 offset:57392
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v2, 2.0
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s3, v0
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s1, v0
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(14)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
@@ -787,7 +787,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:32
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:16
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3]
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s3
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[56:59] offset:8288
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[60:63] offset:8304
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[48:51] offset:8256
@@ -862,12 +862,12 @@ entry:
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v0, s2, v1
+; GCN-NEXT:    v_add_u32_e32 v0, s0, v1
 ; GCN-NEXT:    ds_read_b128 a[28:31], v0 offset:112
 ; GCN-NEXT:    ds_read_b128 a[24:27], v0 offset:96
 ; GCN-NEXT:    ds_read_b128 a[20:23], v0 offset:80
@@ -878,7 +878,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ds_read_b128 a[12:15], v0 offset:48
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT:    v_add_u32_e32 v1, s3, v1
+; GCN-NEXT:    v_add_u32_e32 v1, s1, v1
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    s_nop 7
@@ -902,7 +902,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ds_read_b128 a[0:3], v0 offset:8192
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
@@ -995,12 +995,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v2, 1.0
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v3, 2.0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s2, v1
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s0, v1
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v0 offset:112
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v0 offset:96
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v0 offset:80
@@ -1011,7 +1011,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v0 offset:48
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s3, v1
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s1, v1
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_nop 7
@@ -1035,7 +1035,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v0 offset:8192
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v1, s3
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v1, s1
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 4ce0ff20e3b73..3a867879bb809 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -440,59 +440,59 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ;
 ; GFX900-SDAG-LABEL: s_exp_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc2ce8ed0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s7, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s3, v0
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v0, -v2
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v0, -v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v1, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s6, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v1, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s2, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v7, v6
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v0, -v6
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v0, -v6
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v8, v6, v7
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v1, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v1, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, v8, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v0, -v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v0, -v2
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s7, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v1, v3
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v1, v3
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v0, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v0, -v5
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v1, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v1, v0
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v1, v5
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
@@ -502,18 +502,18 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; SI-SDAG-LABEL: s_exp_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 5ab960f47f57b..a162949587481 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -442,59 +442,59 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_exp10_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x40549a78
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x33979a37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc23369f4
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s7, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s3, v0
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v0, -v2
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v0, -v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v1, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s6, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v1, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s2, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v7, v6
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v0, -v6
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v0, -v6
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v8, v6, v7
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v1, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v1, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, v8, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v3, 0x421a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp10_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x40549a78
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x33979a37
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v0, -v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v0, -v2
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s7, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v1, v3
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v1, v3
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v0, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v0, -v5
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v1, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v1, v0
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v1, v5
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
@@ -504,18 +504,18 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc23369f4
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x421a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; SI-SDAG-LABEL: s_exp10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 6cca705f7b1db..36e78975cdb01 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -270,25 +270,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_exp2_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX900-SDAG-NEXT:    v_add_f32_e32 v4, s7, v4
-; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, s6, v1
+; GFX900-SDAG-NEXT:    v_add_f32_e32 v4, s3, v4
+; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, s2, v1
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v4, v4
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, v4, v3
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 90a15ae8d9b28..035b2439eff15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -313,25 +313,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_log2_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v4, s7, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, s6, v1
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v4, s3, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v4, v4
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v1, v4, v3
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_log2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index 8196999b8f1f1..7ad7cc821c1b5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -133,57 +133,31 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX8-LABEL: round_v2f32:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    s_brev_b32 s8, -2
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
-; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s3
-; GFX8-NEXT:    v_sub_f32_e32 v1, s3, v0
-; GFX8-NEXT:    s_mov_b32 s4, s0
-; GFX8-NEXT:    s_mov_b32 s5, s1
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_bfi_b32 v1, s8, v1, v2
-; GFX8-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s2
-; GFX8-NEXT:    v_sub_f32_e32 v2, s2, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_bfi_b32 v2, s8, v2, v3
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX8-NEXT:    s_endpgm
-;
-; GFX9-LABEL: round_v2f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_brev_b32 s8, -2
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s7
-; GFX9-NEXT:    v_sub_f32_e32 v1, s7, v0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    v_bfi_b32 v1, s8, v1, v2
-; GFX9-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s6
-; GFX9-NEXT:    v_sub_f32_e32 v2, s6, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NEXT:    v_bfi_b32 v2, s8, v2, v3
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: round_v2f32:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT:    s_brev_b32 s8, -2
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s3
+; GFX89-NEXT:    v_sub_f32_e32 v1, s3, v0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT:    v_mov_b32_e32 v2, s3
+; GFX89-NEXT:    v_bfi_b32 v1, s8, v1, v2
+; GFX89-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s2
+; GFX89-NEXT:    v_sub_f32_e32 v2, s2, v0
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX89-NEXT:    v_mov_b32_e32 v3, s2
+; GFX89-NEXT:    v_bfi_b32 v2, s8, v2, v3
+; GFX89-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_v2f32:
 ; GFX11:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index a54405bf1b471..994ef22539a65 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -8,12 +8,12 @@
 define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
 ; GFX9-LABEL: s_lshr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s7, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s3, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_lshr_v2i16:
@@ -54,11 +54,11 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 ;
 ; GFX10-LABEL: s_lshr_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s7, s6
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_lshr_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 944db3d3adc3a..9ec37a5e14cdf 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -220,40 +220,40 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX9-LABEL: madak_2_use_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x41200000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
 ; GFX9-NEXT:    v_mac_f32_e32 v4, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v0, v4, s[6:7] offset:4
+; GFX9-NEXT:    global_store_dword v0, v4, s[2:3] offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: madak_2_use_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-MAD-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
 ; GFX10-MAD-NEXT:    v_madak_f32 v1, v1, v3, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX10-MAD-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[6:7] offset:4
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
 ; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
@@ -282,40 +282,40 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX940-FMA-LABEL: madak_2_use_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v4, 0x41200000
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7] sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
 ; GFX940-FMA-NEXT:    v_fmac_f32_e32 v4, v1, v3
-; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_store_dword v0, v4, s[6:7] offset:4 sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: madak_2_use_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v1, v1, v3, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX10-FMA-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[6:7] offset:4
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
 ; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
@@ -398,24 +398,24 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-LABEL: madak_m_inline_imm_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: madak_m_inline_imm_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-MAD-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
 ; GFX11-MAD-LABEL: madak_m_inline_imm_f32:
@@ -435,24 +435,24 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX940-FMA-LABEL: madak_m_inline_imm_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
-; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: madak_m_inline_imm_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: madak_m_inline_imm_f32:
@@ -931,23 +931,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ;
 ; GFX9-LABEL: s_s_madak_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    v_mac_f32_e32 v1, s6, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mac_f32_e32 v1, s2, v2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: s_s_madak_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, s7
-; GFX10-MAD-NEXT:    v_madak_f32 v0, s6, v0, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, s3
+; GFX10-MAD-NEXT:    v_madak_f32 v0, s2, v0, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
 ; GFX11-MAD-LABEL: s_s_madak_f32:
@@ -964,23 +964,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ;
 ; GFX940-FMA-LABEL: s_s_madak_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, s7
-; GFX940-FMA-NEXT:    v_fmac_f32_e32 v1, s6, v2
-; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-FMA-NEXT:    v_fmac_f32_e32 v1, s2, v2
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: s_s_madak_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, s7
-; GFX10-FMA-NEXT:    v_fmaak_f32 v0, s6, v0, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, s3
+; GFX10-FMA-NEXT:    v_fmaak_f32 v0, s2, v0, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: s_s_madak_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 86a5055ab0704..940287d44d8d1 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -5,21 +5,21 @@
 define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) {
 ; GCN-LABEL: vector_clause:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-SCRATCH-LABEL: vector_clause:
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
index c98cfa08160ca..9d6e0927b0dfd 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
@@ -8,17 +8,17 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
 define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: ctlz_i64_poison:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -40,23 +40,23 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ctlz_i64_poison:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7]
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:2
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
@@ -76,7 +76,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
@@ -87,17 +87,17 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: ctlz_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -120,23 +120,23 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ctlz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7]
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:2
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
@@ -157,7 +157,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone
@@ -168,17 +168,17 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: cttz_i64_poison:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -200,23 +200,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX9-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cttz_i64_poison:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:7
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:6
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7]
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
@@ -238,7 +238,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
@@ -249,17 +249,17 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: cttz_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -282,23 +282,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cttz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:7
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:6
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7]
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
@@ -321,7 +321,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 58fd4b9bd2fee..357b851a8f56f 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -39,17 +39,17 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; GFX9-LABEL: test_smul24_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_bfe_i32 s4, s6, 0x180000
-; GFX9-NEXT:    s_bfe_i32 s5, s7, 0x180000
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
+; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: test_smul24_i32:
@@ -126,17 +126,17 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: test_smulhi24_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_bfe_i32 s4, s6, 0x180000
-; GFX9-NEXT:    s_bfe_i32 s5, s7, 0x180000
-; GFX9-NEXT:    s_mul_hi_i32 s4, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
+; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
+; GFX9-NEXT:    s_mul_hi_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: test_smulhi24_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 698a54de108f7..3a16c88f32cc3 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -39,17 +39,17 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; GFX9-LABEL: test_umul24_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_and_b32 s4, s6, 0xffffff
-; GFX9-NEXT:    s_and_b32 s5, s7, 0xffffff
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
+; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
   %0 = shl i32 %a, 8
@@ -405,17 +405,17 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a,
 ;
 ; GFX9-LABEL: test_umulhi24_i32_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_and_b32 s4, s6, 0xffffff
-; GFX9-NEXT:    s_and_b32 s5, s7, 0xffffff
-; GFX9-NEXT:    s_mul_hi_u32 s4, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
+; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
+; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
   %a.24 = and i32 %a, 16777215
@@ -663,14 +663,14 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: test_umulhi16_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s6, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s7, 0xffff
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[4:5]
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX9-NEXT:    s_mul_i32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %a.16 = and i32 %a, 65535
diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
index 1899a0abc6592..a3f7906a05f6b 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -4,17 +4,17 @@
 define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -35,17 +35,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -67,17 +67,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -99,17 +99,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -132,17 +132,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_neg_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -163,17 +163,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_neg_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -194,16 +194,16 @@ bb:
 define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: add_vector_neg_bitcast_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v0, v0
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
-; GCN-NEXT:    global_store_dword v2, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
@@ -222,11 +222,11 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_lo_neg_scalar_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v3, v1
@@ -237,7 +237,7 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspa
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GCN-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1
-; GCN-NEXT:    global_store_dword v4, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v4, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -261,10 +261,10 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v3, v1
@@ -273,7 +273,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v3 neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v1, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 3f8b64b618e9d..e076df97e1ba4 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -720,13 +720,13 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture %
 define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 {
 ; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[4:5] offset:4 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
 ; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg:
@@ -934,13 +934,13 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture %
 define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 {
 ; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[4:5] offset:4 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
 ; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg:
@@ -1172,11 +1172,11 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture %
 define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 {
 ; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
 ; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg:
@@ -1592,13 +1592,13 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2
 define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) #0 {
 ; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[4:5] offset:4 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
 ; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg:
@@ -1983,14 +1983,14 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128
 define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) #0 {
 ; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-NO-PRELOAD-NEXT:    global_store_byte_d16_hi v0, v1, s[4:5] offset:6 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[4:5] offset:4 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
 ; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg:
diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
index 2ce0b9eed02cb..ce92d40cca2b0 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
@@ -50,40 +50,40 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a
 define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) {
 ; SDAG-LABEL: buffers_from_flat_dont_alias:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; SDAG-NEXT:    s_mov_b32 s3, 0
-; SDAG-NEXT:    s_mov_b32 s2, 16
+; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-NEXT:    s_mov_b32 s7, 0
+; SDAG-NEXT:    s_mov_b32 s6, 16
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    s_and_b32 s1, s5, 0xffff
-; SDAG-NEXT:    s_mov_b32 s0, s4
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; SDAG-NEXT:    s_and_b32 s1, s7, 0xffff
-; SDAG-NEXT:    s_mov_b32 s0, s6
+; SDAG-NEXT:    s_and_b32 s5, s1, 0xffff
+; SDAG-NEXT:    s_mov_b32 s4, s0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SDAG-NEXT:    s_and_b32 s5, s3, 0xffff
+; SDAG-NEXT:    s_mov_b32 s4, s2
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; SDAG-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; SDAG-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; SDAG-NEXT:    v_mul_f32_e32 v3, v3, v3
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: buffers_from_flat_dont_alias:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GISEL-NEXT:    s_mov_b32 s3, 0
-; GISEL-NEXT:    s_mov_b32 s2, 16
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT:    s_mov_b32 s7, 0
+; GISEL-NEXT:    s_mov_b32 s6, 16
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_and_b32 s1, s5, 0xffff
-; GISEL-NEXT:    s_mov_b32 s0, s4
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GISEL-NEXT:    s_and_b32 s1, s7, 0xffff
-; GISEL-NEXT:    s_mov_b32 s0, s6
+; GISEL-NEXT:    s_and_b32 s5, s1, 0xffff
+; GISEL-NEXT:    s_mov_b32 s4, s0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_and_b32 s5, s3, 0xffff
+; GISEL-NEXT:    s_mov_b32 s4, s2
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v3, v3, v3
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GISEL-NEXT:    s_endpgm
   %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0)
   %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index fcccd2da07f76..a87973d93ac77 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -47,12 +47,12 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: rotl_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sub_i32 s0, 32, s7
-; GFX10-NEXT:    v_alignbit_b32 v1, s6, s6, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_sub_i32 s3, 32, s3
+; GFX10-NEXT:    v_alignbit_b32 v1, s2, s2, s3
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotl_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 214894092a8b0..058ee589bc4b0 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -43,11 +43,11 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: rotr_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s6, s6, s7
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_alignbit_b32 v1, s2, s2, s3
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotr_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index acdcb631dccbd..b81af3eb838f1 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -8,15 +8,15 @@
 define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
 ; GFX9-LABEL: s_shl_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s7, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s3, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_v2i16:
@@ -59,14 +59,14 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 ;
 ; GFX10-LABEL: s_shl_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s7, s6
-; GFX10-NEXT:    s_mov_b32 s0, s4
-; GFX10-NEXT:    s_mov_b32 s1, s5
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s3, s2
+; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_shl_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 3ae982089228d..ded308ae4f230 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -33,12 +33,12 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ;
 ; GFX9-LABEL: s_sub_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_i32 s0, s6, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_sub_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index bfeab97d81dbe..6ec213a06999b 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -214,15 +214,15 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr
 define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
 ; GFX9-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_pk_sub_i16 v0, s6, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_sub_v2i16_kernarg:
@@ -246,14 +246,14 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ;
 ; GFX10-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_sub_i16 v0, s6, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
-; GFX10-NEXT:    s_mov_b32 s1, s5
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    v_pk_sub_i16 v0, s2, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_sub_v2i16_kernarg:
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index 65eb1cee42350..fc6df735c05b0 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -135,13 +135,13 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
 ;
 ; GFX10-LABEL: v_cnd_nan:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1, s7, s[0:1]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1, s3, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_cnd_nan:
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index e3185e189157b..89fef7eead839 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -104,15 +104,15 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ;
 ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr:
 ; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_med3_i16 v2, s6, 0, v1
-; SDAG-GFX9-NEXT:    v_med3_i16 v1, s7, 0, v1
+; SDAG-GFX9-NEXT:    v_med3_i16 v2, s2, 0, v1
+; SDAG-GFX9-NEXT:    v_med3_i16 v1, s3, 0, v1
 ; SDAG-GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SDAG-GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
-; SDAG-GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX9-NEXT:    s_endpgm
 ;
 ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr:
@@ -156,22 +156,22 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ;
 ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr:
 ; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s0, 0
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s1, 0xff
+; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s4, 0
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s5, 0xff
 ; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s6
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s7
-; GISEL-GFX9-NEXT:    s_max_i32 s2, s2, s0
-; GISEL-GFX9-NEXT:    s_max_i32 s0, s3, s0
 ; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s0, s0
-; GISEL-GFX9-NEXT:    s_min_i32 s2, s2, s1
-; GISEL-GFX9-NEXT:    s_min_i32 s0, s0, s1
-; GISEL-GFX9-NEXT:    s_pack_ll_b32_b16 s0, s2, s0
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s3
+; GISEL-GFX9-NEXT:    s_max_i32 s2, s2, s4
+; GISEL-GFX9-NEXT:    s_max_i32 s3, s3, s4
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s2
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s3
+; GISEL-GFX9-NEXT:    s_min_i32 s2, s2, s5
+; GISEL-GFX9-NEXT:    s_min_i32 s3, s3, s5
+; GISEL-GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
+; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index e12a4beb5dbe5..901e88a4c6aca 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1344,40 +1344,40 @@ exit:
 define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
 ; GFX1032-LABEL: fdiv_f32:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
+; GFX1032-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
 ; GFX1032-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX1032-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX1032-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX1032-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
+; GFX1032-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
 ; GFX1032-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX1032-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX1032-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX1032-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX1032-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX1032-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX1032-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX1032-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: fdiv_f32:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_div_scale_f32 v0, s[0:1], s7, s7, s6
+; GFX1064-NEXT:    v_div_scale_f32 v0, s[4:5], s3, s3, s2
 ; GFX1064-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX1064-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX1064-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX1064-NEXT:    v_div_scale_f32 v2, vcc, s6, s7, s6
+; GFX1064-NEXT:    v_div_scale_f32 v2, vcc, s2, s3, s2
 ; GFX1064-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX1064-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX1064-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX1064-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX1064-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX1064-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX1064-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX1064-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX1064-NEXT:    s_endpgm
 entry:
   %fdiv = fdiv float %a, %b
@@ -2138,23 +2138,23 @@ main_body:
 define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX1032-LABEL: test_intr_fcmp_i64:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1032-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX1032-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_intr_fcmp_i64:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |s7|
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1064-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
@@ -2195,22 +2195,22 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) {
 define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX1032-LABEL: test_intr_fcmp_i32:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX1032-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_intr_fcmp_i32:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |s7|
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX1064-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)

>From e76cf18ff79836a41af11fbb4eeb4ece3c792ab8 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Mon, 1 Jul 2024 06:19:49 +0000
Subject: [PATCH 4/6] Removed unwanted helper function hasXnackReplay.

---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 19d5b950d7142..db5b467f22389 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -967,7 +967,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
   bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
-  bool hasXnackReplay() const { return GFX8Insts; }
 
   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }

>From e7e6cbc4abd476a038fd7836e5078565e73d1fe9 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Wed, 3 Jul 2024 04:17:25 +0000
Subject: [PATCH 5/6] Used byte width and simplified some more code.

---
 .../Target/AMDGPU/SILoadStoreOptimizer.cpp    |  28 +-
 .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll    |  12 +-
 .../GlobalISel/llvm.amdgcn.update.dpp.ll      |  28 +-
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 152 +++----
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll |  84 ++--
 llvm/test/CodeGen/AMDGPU/add.v2i16.ll         |  14 +-
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      | 408 +++++++++---------
 llvm/test/CodeGen/AMDGPU/build_vector.ll      |  12 +-
 llvm/test/CodeGen/AMDGPU/cluster_stores.ll    |  88 ++--
 .../CodeGen/AMDGPU/combine-cond-add-sub.ll    |  20 +-
 llvm/test/CodeGen/AMDGPU/ctlz.ll              | 180 ++++----
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   | 158 +++----
 llvm/test/CodeGen/AMDGPU/cttz.ll              | 184 ++++----
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   | 126 +++---
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll     | 262 ++++++-----
 .../AMDGPU/divergence-driven-buildvector.ll   |  32 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |  20 +-
 llvm/test/CodeGen/AMDGPU/fdiv.ll              |  86 ++--
 llvm/test/CodeGen/AMDGPU/flat_atomics.ll      |  20 +-
 .../CodeGen/AMDGPU/flat_atomics_i32_system.ll |  56 +--
 llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll       | 180 ++++----
 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll |  12 +-
 llvm/test/CodeGen/AMDGPU/fshl.ll              |  42 +-
 llvm/test/CodeGen/AMDGPU/fshr.ll              |  14 +-
 llvm/test/CodeGen/AMDGPU/global_atomics.ll    |  16 +-
 .../AMDGPU/global_atomics_i32_system.ll       |  96 ++---
 .../insert_waitcnt_for_precise_memory.ll      | 160 +++----
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll   |  14 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll    |  32 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll    |  24 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll    |  16 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll    |  16 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll    |  16 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll    | 112 +++--
 .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 244 +++++------
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          |  48 +--
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        |  48 +--
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |  12 +-
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |  12 +-
 llvm/test/CodeGen/AMDGPU/llvm.round.ll        |  76 ++--
 llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll        |  14 +-
 llvm/test/CodeGen/AMDGPU/madak.ll             | 104 ++---
 llvm/test/CodeGen/AMDGPU/memory_clause.ll     |  18 +-
 .../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll  | 160 +++----
 llvm/test/CodeGen/AMDGPU/mul_int24.ll         |  40 +-
 llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll |  52 +--
 llvm/test/CodeGen/AMDGPU/packed-op-sel.ll     |  72 ++--
 .../AMDGPU/ptr-buffer-alias-scheduling.ll     |  36 +-
 llvm/test/CodeGen/AMDGPU/rotl.ll              |   8 +-
 llvm/test/CodeGen/AMDGPU/rotr.ll              |   6 +-
 llvm/test/CodeGen/AMDGPU/shl.v2i16.ll         |  30 +-
 llvm/test/CodeGen/AMDGPU/sub.ll               |   8 +-
 llvm/test/CodeGen/AMDGPU/sub.v2i16.ll         |  30 +-
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll         |  10 +-
 llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll   |  34 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll            |  54 +--
 56 files changed, 1920 insertions(+), 1916 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 7bdc494ff545d..ea8ad39b90257 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1234,11 +1234,8 @@ void SILoadStoreOptimizer::copyToDestRegs(
   // The constrained sload instructions in S_LOAD_IMM class will have
   // `early-clobber` flag in the dst operand. Remove the flag before using the
   // MOs in copies.
-  if (Dest0->isEarlyClobber())
-    Dest0->setIsEarlyClobber(false);
-
-  if (Dest1->isEarlyClobber())
-    Dest1->setIsEarlyClobber(false);
+  Dest0->setIsEarlyClobber(false);
+  Dest1->setIsEarlyClobber(false);
 
   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
       .add(*Dest0) // Copy to same destination including flags and sub reg.
@@ -1729,24 +1726,23 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
     // If XNACK is enabled, use the constrained opcodes when the first load is
     // under-aligned.
     const MachineMemOperand *MMO = *CI.I->memoperands_begin();
-    auto NeedsConstrainedOpc = [&MMO, Width](const GCNSubtarget &ST) {
-      return ST.isXNACKEnabled() && MMO->getAlign().value() < Width;
-    };
+    bool NeedsConstrainedOpc =
+        STM->isXNACKEnabled() && MMO->getAlign().value() < (Width << 2);
     switch (Width) {
     default:
       return 0;
     case 2:
-      return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
-                                       : AMDGPU::S_LOAD_DWORDX2_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
+                                 : AMDGPU::S_LOAD_DWORDX2_IMM;
     case 3:
-      return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
-                                       : AMDGPU::S_LOAD_DWORDX3_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
+                                 : AMDGPU::S_LOAD_DWORDX3_IMM;
     case 4:
-      return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
-                                       : AMDGPU::S_LOAD_DWORDX4_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
+                                 : AMDGPU::S_LOAD_DWORDX4_IMM;
     case 8:
-      return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
-                                       : AMDGPU::S_LOAD_DWORDX8_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
+                                 : AMDGPU::S_LOAD_DWORDX8_IMM;
     }
   }
   case GLOBAL_LOAD:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index fff341b07881b..a018ea5bf18f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -159,10 +159,10 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
 define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX940-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX940-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
@@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
 define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s1
-; GFX940-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NEXT:    v_mov_b32_e32 v0, s3
+; GFX940-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    ds_pk_add_bf16 v1, v0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 727184a36c006..1092bb4dc834a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -19,13 +19,13 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
 ;
 ; GFX10-LABEL: dpp_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: dpp_test:
@@ -174,16 +174,16 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
 ;
 ; GFX10-LABEL: update_dppv2i32_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppv2i32_test:
@@ -229,16 +229,16 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
 ;
 ; GFX10-LABEL: update_dppv2f32_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppv2f32_test:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index b666f45521661..8a2274cbfbf62 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -692,121 +692,121 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ;
 ; GFX9-LABEL: sdivrem_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s8, s6, 31
-; GFX9-NEXT:    s_add_i32 s6, s6, s8
-; GFX9-NEXT:    s_xor_b32 s6, s6, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT:    s_ashr_i32 s9, s7, 31
-; GFX9-NEXT:    s_add_i32 s7, s7, s9
-; GFX9-NEXT:    s_xor_b32 s7, s7, s9
+; GFX9-NEXT:    s_ashr_i32 s0, s14, 31
+; GFX9-NEXT:    s_add_i32 s1, s14, s0
+; GFX9-NEXT:    s_xor_b32 s1, s1, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX9-NEXT:    s_ashr_i32 s2, s15, 31
+; GFX9-NEXT:    s_add_i32 s3, s15, s2
+; GFX9-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT:    s_sub_i32 s12, 0, s6
-; GFX9-NEXT:    s_ashr_i32 s10, s4, 31
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX9-NEXT:    s_sub_i32 s6, 0, s1
+; GFX9-NEXT:    s_ashr_i32 s4, s12, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_add_i32 s4, s4, s10
-; GFX9-NEXT:    s_xor_b32 s4, s4, s10
-; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
+; GFX9-NEXT:    s_sub_i32 s7, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s5, s13, 31
+; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s12, 0, s7
+; GFX9-NEXT:    s_add_i32 s6, s12, s4
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_ashr_i32 s11, s5, 31
-; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX9-NEXT:    s_add_i32 s5, s5, s11
+; GFX9-NEXT:    s_xor_b32 s6, s6, s4
+; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
+; GFX9-NEXT:    s_add_i32 s7, s13, s5
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
-; GFX9-NEXT:    s_xor_b32 s5, s5, s11
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
+; GFX9-NEXT:    s_xor_b32 s7, s7, s5
+; GFX9-NEXT:    s_xor_b32 s0, s4, s0
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s1
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    v_mul_hi_u32 v1, s7, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s6, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s6, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s7
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s3
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    s_xor_b32 s4, s10, s8
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
+; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
-; GFX9-NEXT:    s_xor_b32 s4, s11, s9
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
+; GFX9-NEXT:    s_xor_b32 s0, s5, s2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
-; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
+; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v1, s0, v1
+; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v3
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s1, s10, 31
-; GFX10-NEXT:    s_ashr_i32 s2, s11, 31
-; GFX10-NEXT:    s_add_i32 s0, s10, s1
-; GFX10-NEXT:    s_add_i32 s3, s11, s2
-; GFX10-NEXT:    s_xor_b32 s10, s0, s1
+; GFX10-NEXT:    s_ashr_i32 s1, s14, 31
+; GFX10-NEXT:    s_ashr_i32 s2, s15, 31
+; GFX10-NEXT:    s_add_i32 s0, s14, s1
+; GFX10-NEXT:    s_add_i32 s3, s15, s2
+; GFX10-NEXT:    s_xor_b32 s4, s0, s1
 ; GFX10-NEXT:    s_xor_b32 s3, s3, s2
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX10-NEXT:    s_sub_i32 s0, 0, s10
-; GFX10-NEXT:    s_sub_i32 s11, 0, s3
-; GFX10-NEXT:    s_ashr_i32 s12, s9, 31
+; GFX10-NEXT:    s_sub_i32 s0, 0, s4
+; GFX10-NEXT:    s_sub_i32 s5, 0, s3
+; GFX10-NEXT:    s_ashr_i32 s6, s13, 31
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX10-NEXT:    s_add_i32 s7, s13, s6
+; GFX10-NEXT:    s_xor_b32 s7, s7, s6
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s11, v1
-; GFX10-NEXT:    s_ashr_i32 s11, s8, 31
-; GFX10-NEXT:    s_add_i32 s0, s8, s11
-; GFX10-NEXT:    s_add_i32 s8, s9, s12
-; GFX10-NEXT:    s_xor_b32 s0, s0, s11
-; GFX10-NEXT:    s_xor_b32 s8, s8, s12
+; GFX10-NEXT:    v_mul_lo_u32 v3, s5, v1
+; GFX10-NEXT:    s_ashr_i32 s5, s12, 31
+; GFX10-NEXT:    s_add_i32 s0, s12, s5
+; GFX10-NEXT:    s_xor_b32 s1, s5, s1
+; GFX10-NEXT:    s_xor_b32 s0, s0, s5
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX10-NEXT:    s_xor_b32 s1, s11, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s8, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s10
+; GFX10-NEXT:    v_mul_hi_u32 v1, s7, v1
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s4
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s0, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s8, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s7, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s4, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s4, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
@@ -814,26 +814,26 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s4, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s4, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s0, s12, s2
+; GFX10-NEXT:    s_xor_b32 s0, s6, s2
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX10-NEXT:    v_xor_b32_e32 v2, s11, v2
-; GFX10-NEXT:    v_xor_b32_e32 v3, s12, v3
+; GFX10-NEXT:    v_xor_b32_e32 v2, s5, v2
+; GFX10-NEXT:    v_xor_b32_e32 v3, s6, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s11, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s12, v3
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s5, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s6, v3
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
 ; GFX10-NEXT:    s_endpgm
   %div = sdiv <2 x i32> %x, %y
   store <2 x i32> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index a58397eccaba7..62d8b7d6f045c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -576,12 +576,12 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ;
 ; GFX9-LABEL: udivrem_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX9-NEXT:    s_sub_i32 s0, 0, s10
-; GFX9-NEXT:    s_sub_i32 s1, 0, s11
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX9-NEXT:    s_sub_i32 s0, 0, s14
+; GFX9-NEXT:    s_sub_i32 s1, 0, s15
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -593,47 +593,47 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s12, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s10
+; GFX9-NEXT:    v_mul_hi_u32 v1, s13, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s14
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s11
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s15
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s8, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v3
+; GFX9-NEXT:    v_sub_u32_e32 v2, s12, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, s13, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s10, v2
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s14, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s15, v3
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s10, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s14, v2
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s11, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s15, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX10-NEXT:    s_sub_i32 s0, 0, s10
-; GFX10-NEXT:    s_sub_i32 s1, 0, s11
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX10-NEXT:    s_sub_i32 s0, 0, s14
+; GFX10-NEXT:    s_sub_i32 s1, 0, s15
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -646,34 +646,34 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s10
-; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s11
+; GFX10-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX10-NEXT:    v_mul_hi_u32 v1, s13, v1
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s14
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s15
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s8, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s9, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s11, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s12, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s13, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s14, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s15, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s11, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s14, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s15, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[4:5]
-; GFX10-NEXT:    global_store_dwordx2 v8, v[2:3], s[6:7]
+; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[8:9]
+; GFX10-NEXT:    global_store_dwordx2 v8, v[2:3], s[10:11]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv <2 x i32> %x, %y
   store <2 x i32> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 6f67ce4de9ce5..611a7b566070c 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -243,21 +243,21 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ;
 ; GFX9-LABEL: s_test_add_v2i16_kernarg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_pk_add_u16 v1, s2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_pk_add_u16 v1, s6, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_add_v2i16_kernarg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_add_u16 v1, s2, s3
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_pk_add_u16 v1, s6, s7
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_add_v2i16_kernarg:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 8144fb7a3b646..559871d162e13 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -72,31 +72,31 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX9-NEXT:    s_mul_i32 s5, s4, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_add_i32 s6, s4, 1
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX9-NEXT:    s_mul_i32 s1, s0, s7
+; GFX9-NEXT:    s_sub_i32 s1, s6, s1
+; GFX9-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-NEXT:    s_sub_i32 s3, s1, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX9-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -167,29 +167,29 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: urem_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX9-NEXT:    s_mul_i32 s4, s4, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
-; GFX9-NEXT:    s_sub_i32 s4, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX9-NEXT:    s_sub_i32 s4, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX9-NEXT:    s_mul_i32 s0, s0, s7
+; GFX9-NEXT:    s_sub_i32 s0, s6, s0
+; GFX9-NEXT:    s_sub_i32 s1, s0, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX9-NEXT:    s_sub_i32 s1, s0, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -280,37 +280,37 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: sdiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_abs_i32 s4, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT:    s_sub_i32 s5, 0, s4
-; GFX9-NEXT:    s_xor_b32 s3, s2, s3
-; GFX9-NEXT:    s_abs_i32 s2, s2
+; GFX9-NEXT:    s_abs_i32 s0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_xor_b32 s1, s6, s7
+; GFX9-NEXT:    s_abs_i32 s2, s6
+; GFX9-NEXT:    s_sub_i32 s3, 0, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s1, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT:    s_mul_i32 s6, s5, s4
+; GFX9-NEXT:    s_mul_i32 s3, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GFX9-NEXT:    s_add_i32 s6, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
+; GFX9-NEXT:    s_mul_i32 s6, s3, s0
 ; GFX9-NEXT:    s_sub_i32 s2, s2, s6
-; GFX9-NEXT:    s_add_i32 s7, s5, 1
-; GFX9-NEXT:    s_sub_i32 s6, s2, s4
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
-; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX9-NEXT:    s_add_i32 s7, s3, 1
+; GFX9-NEXT:    s_sub_i32 s6, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s3, s7, s3
 ; GFX9-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX9-NEXT:    s_add_i32 s6, s5, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
-; GFX9-NEXT:    s_cselect_b32 s2, s6, s5
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_add_i32 s6, s3, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s6, s3
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -394,34 +394,34 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: srem_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_abs_i32 s3, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s5, 0, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_abs_i32 s2, s2
+; GFX9-NEXT:    s_abs_i32 s0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_ashr_i32 s1, s6, 31
+; GFX9-NEXT:    s_abs_i32 s2, s6
+; GFX9-NEXT:    s_sub_i32 s3, 0, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT:    s_mul_i32 s5, s5, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_xor_b32 s2, s2, s4
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_mul_i32 s3, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GFX9-NEXT:    s_add_i32 s6, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
+; GFX9-NEXT:    s_mul_i32 s3, s3, s0
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s3, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX9-NEXT:    s_sub_i32 s3, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s3, s2
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -5482,13 +5482,13 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: udiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s3, s3, 12
-; GFX9-NEXT:    s_lshr_b32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_add_i32 s0, s7, 12
+; GFX9-NEXT:    s_lshr_b32 s0, s6, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = udiv i32 %x, %shl.y
@@ -5524,14 +5524,14 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: udiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_lshr_b32 s0, s6, 12
+; GFX9-NEXT:    s_lshr_b32 s1, s7, 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -5570,18 +5570,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <
 ;
 ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_hi_u32 s4, s3, 0x100101
-; GFX9-NEXT:    s_sub_i32 s3, s3, s4
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 11
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_mul_hi_u32 s1, s7, 0x100101
+; GFX9-NEXT:    s_sub_i32 s2, s7, s1
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX9-NEXT:    s_add_i32 s2, s2, s1
+; GFX9-NEXT:    s_lshr_b32 s0, s6, 12
+; GFX9-NEXT:    s_lshr_b32 s1, s2, 11
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -5875,14 +5875,14 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: urem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT:    s_add_i32 s3, s3, -1
-; GFX9-NEXT:    s_and_b32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
+; GFX9-NEXT:    s_add_i32 s0, s0, -1
+; GFX9-NEXT:    s_and_b32 s0, s6, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = urem i32 %x, %shl.y
@@ -5918,14 +5918,14 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: urem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
-; GFX9-NEXT:    s_and_b32 s3, s3, 0xfff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_and_b32 s0, s6, 0xfff
+; GFX9-NEXT:    s_and_b32 s1, s7, 0xfff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6234,41 +6234,41 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s6, 0, s3
-; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s5
+; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_ashr_i32 s2, s6, 31
+; GFX9-NEXT:    s_add_i32 s3, s6, s2
+; GFX9-NEXT:    s_sub_i32 s6, 0, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s2, s2, s5
+; GFX9-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
 ; GFX9-NEXT:    s_mul_i32 s6, s6, s7
 ; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
-; GFX9-NEXT:    s_mul_hi_u32 s6, s2, s7
-; GFX9-NEXT:    s_mul_i32 s8, s6, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s8
+; GFX9-NEXT:    s_mul_hi_u32 s6, s3, s7
+; GFX9-NEXT:    s_mul_i32 s8, s6, s0
+; GFX9-NEXT:    s_sub_i32 s3, s3, s8
 ; GFX9-NEXT:    s_add_i32 s7, s6, 1
-; GFX9-NEXT:    s_sub_i32 s8, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_sub_i32 s8, s3, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s0
 ; GFX9-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX9-NEXT:    s_cselect_b32 s2, s8, s2
+; GFX9-NEXT:    s_cselect_b32 s3, s8, s3
 ; GFX9-NEXT:    s_add_i32 s7, s6, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s7, s6
-; GFX9-NEXT:    s_xor_b32 s3, s5, s4
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s7, s6
+; GFX9-NEXT:    s_xor_b32 s1, s2, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = sdiv i32 %x, %shl.y
@@ -6310,20 +6310,20 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
-; GFX9-NEXT:    s_add_i32 s2, s2, s4
-; GFX9-NEXT:    s_add_i32 s3, s3, s5
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
+; GFX9-NEXT:    s_add_i32 s0, s6, s0
+; GFX9-NEXT:    s_add_i32 s1, s7, s1
+; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
+; GFX9-NEXT:    s_ashr_i32 s1, s1, 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6365,21 +6365,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_mul_hi_i32 s5, s3, 0x80080081
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX9-NEXT:    s_add_i32 s5, s5, s3
-; GFX9-NEXT:    s_add_i32 s2, s2, s4
-; GFX9-NEXT:    s_lshr_b32 s3, s5, 31
-; GFX9-NEXT:    s_ashr_i32 s4, s5, 11
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
-; GFX9-NEXT:    s_add_i32 s4, s4, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
+; GFX9-NEXT:    s_mul_hi_i32 s1, s7, 0x80080081
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX9-NEXT:    s_add_i32 s1, s1, s7
+; GFX9-NEXT:    s_add_i32 s0, s6, s0
+; GFX9-NEXT:    s_lshr_b32 s2, s1, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s1, 11
+; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
+; GFX9-NEXT:    s_add_i32 s1, s1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6746,38 +6746,38 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: srem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s5, 0, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s4
+; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_ashr_i32 s1, s6, 31
+; GFX9-NEXT:    s_add_i32 s2, s6, s1
+; GFX9-NEXT:    s_sub_i32 s3, 0, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_xor_b32 s2, s2, s1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
-; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT:    s_mul_i32 s5, s5, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_xor_b32 s2, s2, s4
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_mul_i32 s3, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GFX9-NEXT:    s_add_i32 s6, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
+; GFX9-NEXT:    s_mul_i32 s3, s3, s0
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s3, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX9-NEXT:    s_sub_i32 s3, s2, s0
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s3, s2
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = srem i32 %x, %shl.y
@@ -6821,22 +6821,22 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: srem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
-; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
-; GFX9-NEXT:    s_add_i32 s4, s2, s4
-; GFX9-NEXT:    s_add_i32 s5, s3, s5
-; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
-; GFX9-NEXT:    s_and_b32 s4, s5, 0xfffff000
-; GFX9-NEXT:    s_sub_i32 s3, s3, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
+; GFX9-NEXT:    s_add_i32 s0, s6, s0
+; GFX9-NEXT:    s_add_i32 s1, s7, s1
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX9-NEXT:    s_sub_i32 s0, s6, s0
+; GFX9-NEXT:    s_sub_i32 s1, s7, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index b26d15ed3a1c8..e914635d6c26f 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -316,14 +316,14 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
 ;
 ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
 ; GFX940:       ; %bb.0: ; %entry
-; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX940-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX940-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT:    s_lshl_b32 s0, s7, 16
+; GFX940-NEXT:    s_lshl_b32 s1, s6, 16
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1
 ; GFX940-NEXT:    s_endpgm
 entry:
   %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index b6948dab6bf9f..d511bb1f4a257 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -49,20 +49,20 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
 ;
 ; GFX10-LABEL: cluster_load_cluster_store:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s4, s0, 8
-; GFX10-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-NEXT:    s_add_u32 s6, s0, 16
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    s_addc_u32 s7, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_add_u32 s0, s0, 24
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    v_mov_b32_e32 v4, s6
-; GFX10-NEXT:    v_mov_b32_e32 v5, s7
+; GFX10-NEXT:    s_add_u32 s0, s4, 8
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
+; GFX10-NEXT:    s_add_u32 s2, s4, 16
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    s_addc_u32 s3, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_add_u32 s0, s4, 24
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX10-NEXT:    s_clause 0x3
@@ -70,16 +70,16 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
 ; GFX10-NEXT:    flat_load_dword v9, v[2:3]
 ; GFX10-NEXT:    flat_load_dword v10, v[4:5]
 ; GFX10-NEXT:    flat_load_dword v11, v[6:7]
-; GFX10-NEXT:    s_add_u32 s0, s2, 8
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    s_add_u32 s0, s6, 8
+; GFX10-NEXT:    s_addc_u32 s1, s7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s2, 16
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    s_add_u32 s2, s2, 24
-; GFX10-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-NEXT:    s_add_u32 s0, s6, 16
+; GFX10-NEXT:    s_addc_u32 s1, s7, 0
+; GFX10-NEXT:    s_add_u32 s2, s6, 24
+; GFX10-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-NEXT:    s_addc_u32 s3, s7, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s3
@@ -175,20 +175,20 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
 ;
 ; GFX10-LABEL: cluster_load_valu_cluster_store:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s4, s0, 8
-; GFX10-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    s_add_u32 s6, s0, 16
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    s_addc_u32 s7, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_add_u32 s0, s0, 24
-; GFX10-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, s6
-; GFX10-NEXT:    v_mov_b32_e32 v5, s7
+; GFX10-NEXT:    s_add_u32 s0, s4, 8
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
+; GFX10-NEXT:    s_add_u32 s2, s4, 16
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_addc_u32 s3, s5, 0
+; GFX10-NEXT:    s_add_u32 s0, s4, 24
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_addc_u32 s1, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_mov_b32_e32 v5, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX10-NEXT:    flat_load_dword v6, v[2:3]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
@@ -196,18 +196,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
 ; GFX10-NEXT:    flat_load_dword v8, v[0:1]
 ; GFX10-NEXT:    flat_load_dword v9, v[4:5]
 ; GFX10-NEXT:    flat_load_dword v10, v[2:3]
-; GFX10-NEXT:    s_add_u32 s0, s2, 8
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    s_add_u32 s4, s2, 16
+; GFX10-NEXT:    s_add_u32 s0, s6, 8
+; GFX10-NEXT:    s_addc_u32 s1, s7, 0
+; GFX10-NEXT:    s_add_u32 s2, s6, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    s_addc_u32 s5, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    s_addc_u32 s3, s7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s2, 24
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v4, s4
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    s_add_u32 s0, s6, 24
+; GFX10-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-NEXT:    v_mov_b32_e32 v5, s3
+; GFX10-NEXT:    s_addc_u32 s1, s7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s1
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index 33c0d90f94a39..9c7fa1537c0c2 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -665,17 +665,17 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar
 ;
 ; GFX9-LABEL: sub_zext_setcc_commute:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT:    global_load_dword v3, v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v0, s6, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -714,17 +714,17 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar
 ;
 ; GFX9-LABEL: sub_sext_setcc_commute:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT:    global_load_dword v3, v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v0, s6, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 4decf39d04013..3145ee1f6141e 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -164,28 +164,28 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_ctlz_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32:
@@ -277,32 +277,32 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_ctlz_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_v2i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_v2i32:
@@ -411,11 +411,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_ctlz_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v3, v3
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
@@ -425,16 +425,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_v4i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
@@ -444,7 +444,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_v4i32:
@@ -550,28 +550,28 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; GFX10-LABEL: v_ctlz_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i8:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i8:
@@ -737,24 +737,24 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; GFX10-LABEL: s_ctlz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; GFX10-NEXT:    s_min_u32 s2, s2, 64
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; GFX10-NEXT:    s_min_u32 s0, s0, 64
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 64
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_ctlz_i64_trunc:
@@ -847,25 +847,25 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_ctlz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
@@ -873,7 +873,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i64:
@@ -974,33 +974,33 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; GFX10-LABEL: v_ctlz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i64_trunc:
@@ -1090,29 +1090,29 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1:
@@ -1197,29 +1197,29 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1:
@@ -1313,32 +1313,32 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth:
@@ -1435,32 +1435,32 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth:
@@ -1552,22 +1552,22 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1578,7 +1578,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1:
@@ -1674,25 +1674,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
@@ -1700,7 +1700,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1:
@@ -1795,23 +1795,23 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1824,7 +1824,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1:
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 756b819099682..a377714ebf737 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -134,14 +134,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -211,15 +211,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -295,17 +295,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
-; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -562,14 +562,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_mov_b32 s1, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s4, s[2:3]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
   %ctlz_ret = icmp ne i64 %val, 0
@@ -650,17 +650,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
@@ -754,11 +754,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
@@ -766,7 +766,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
@@ -870,13 +870,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -887,7 +887,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@@ -1051,17 +1051,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
-; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
-; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:6
-; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:7
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:4
+; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:5
+; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:6
+; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:7
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
@@ -1082,7 +1082,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v4, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
@@ -1159,11 +1159,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1171,7 +1171,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1283,12 +1283,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
   %trunc = trunc i64 %ctlz to i32
@@ -1365,17 +1365,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 32, v0
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -1455,17 +1455,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -1535,16 +1535,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1614,16 +1614,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1698,11 +1698,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1710,9 +1710,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa s[2:3], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, s[2:3]
-; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa s[0:1], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1800,17 +1800,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -1889,16 +1889,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1973,16 +1973,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -2058,16 +2058,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -2143,16 +2143,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index ee2894a66fbfc..ec532c8e4adc3 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -148,28 +148,28 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_cttz_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -246,32 +246,32 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_cttz_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_v2i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -362,11 +362,11 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_cttz_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v3, v3
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
@@ -376,16 +376,16 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_v4i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
@@ -395,7 +395,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -475,26 +475,26 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; GFX10-LABEL: v_cttz_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v1, 0x100, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i8:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %valptr
   %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
@@ -629,24 +629,24 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; GFX10-LABEL: s_cttz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ff1_i32_b64 s2, s[2:3]
-; GFX10-NEXT:    s_min_u32 s2, s2, 64
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_ff1_i32_b64 s0, s[6:7]
+; GFX10-NEXT:    s_min_u32 s0, s0, 64
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s2, s[2:3]
-; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 64
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[6:7]
+; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
   %trunc = trunc i64 %cttz to i32
@@ -726,25 +726,25 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_cttz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
@@ -752,7 +752,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -835,33 +835,33 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; GFX10-LABEL: v_cttz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -933,29 +933,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1027,29 +1027,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1130,32 +1130,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1235,32 +1235,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1335,32 +1335,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s0, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s2
-; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[0:1]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s0
+; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1442,31 +1442,31 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %valptr
   %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
@@ -1542,23 +1542,23 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1570,7 +1570,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 392a44318b0a5..086d99916ba04 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -121,14 +121,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -198,15 +198,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -282,17 +282,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -538,14 +538,14 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_mov_b32 s1, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s4, s[2:3]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s0, s[6:7]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
   %cttz_ret = icmp ne i64 %val, 0
@@ -622,16 +622,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
@@ -721,18 +721,18 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
@@ -836,13 +836,13 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -853,7 +853,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
@@ -1017,17 +1017,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
-; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
-; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:6
-; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:7
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:4
+; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:5
+; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:6
+; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:7
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
@@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v0, v4
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
@@ -1152,13 +1152,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1274,13 +1274,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1292,7 +1292,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1404,13 +1404,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1422,7 +1422,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1498,18 +1498,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, 0x100, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
@@ -1597,12 +1597,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
@@ -1610,7 +1610,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 097604d57803e..4226728dbe118 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -933,24 +933,24 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
 ;
 ; GFX10-LABEL: load_i8_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_i8_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_i8_to_f32:
@@ -1013,28 +1013,28 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v2i8_to_v2f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ushort v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v2i8_to_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v2i8_to_v2f32:
@@ -1102,30 +1102,30 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v3i8_to_v3f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX10-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v3i8_to_v3f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v3i8_to_v3f32:
@@ -1194,32 +1194,32 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32:
@@ -1312,15 +1312,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
@@ -1329,19 +1329,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
@@ -1350,7 +1350,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned:
@@ -1643,12 +1643,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff00, v0
@@ -1666,22 +1666,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX10-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX10-NEXT:    global_store_dword v4, v5, s[2:3]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dword v4, v5, s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0xff00
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v0, s[0:1]
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX9-NEXT:    s_movk_i32 s5, 0x900
+; GFX9-NEXT:    s_movk_i32 s0, 0xff00
+; GFX9-NEXT:    s_movk_i32 s1, 0x900
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
@@ -1689,17 +1688,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff00, v4
 ; GFX9-NEXT:    v_add_u16_e32 v8, 9, v4
-; GFX9-NEXT:    v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v9, v4, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u16_e32 v0, 0x900, v0
-; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT:    global_store_dword v5, v0, s[2:3]
+; GFX9-NEXT:    global_store_dword v5, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses:
@@ -1839,17 +1837,17 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v7i8_to_v7f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x5
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3] offset:1
-; GFX10-NEXT:    global_load_short_d16 v7, v0, s[2:3] offset:4
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7] offset:1
+; GFX10-NEXT:    global_load_short_d16 v7, v0, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
@@ -1863,22 +1861,22 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[4:5] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v7i8_to_v7f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v7, v0, s[2:3] offset:2
-; GFX9-NEXT:    global_load_ubyte v8, v0, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v9, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v7, v0, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v8, v0, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v9, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
@@ -1892,8 +1890,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v9
-; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
-; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[0:1] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[4:5] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v7i8_to_v7f32:
@@ -1990,11 +1988,11 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v8i8_to_v8f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[8:9], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v7, v9
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v6, v9
@@ -2004,17 +2002,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v8
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v8
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v8
-; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[4:5] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v8i8_to_v8f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[7:8], v0, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[7:8], v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
@@ -2024,8 +2022,8 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
-; GFX9-NEXT:    global_store_dwordx4 v9, v[4:7], s[0:1] offset:16
-; GFX9-NEXT:    global_store_dwordx4 v9, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v9, v[4:7], s[4:5] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v9, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v8i8_to_v8f32:
@@ -2098,28 +2096,28 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX10-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_inreg_i32_to_f32:
@@ -2184,26 +2182,26 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32:
@@ -2266,24 +2264,24 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: i8_zext_i32_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_i32_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_i32_to_f32:
@@ -2369,15 +2367,15 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
@@ -2386,19 +2384,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
@@ -2407,7 +2405,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32:
@@ -2479,26 +2477,26 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte0_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte0_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte0_to_f32:
@@ -2560,26 +2558,26 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte1_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte1_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte1_to_f32:
@@ -2642,26 +2640,26 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte2_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte2_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte2_to_f32:
@@ -2724,26 +2722,26 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte3_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte3_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte3_to_f32:
@@ -2825,16 +2823,16 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: cvt_ubyte0_or_multiuse:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT:    global_load_dword v0, v0, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: cvt_ubyte0_or_multiuse:
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 8f31bb1fe0a81..739fff5084135 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -376,22 +376,22 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
 ;
 ; GFX9-LABEL: uniform_vec_i16_LH:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_pack_lh_b32_b16 s0, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX906-LABEL: uniform_vec_i16_LH:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
-; GFX906-NEXT:    v_mov_b32_e32 v1, s2
-; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT:    s_pack_lh_b32_b16 s0, s6, s7
+; GFX906-NEXT:    v_mov_b32_e32 v1, s0
+; GFX906-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX906-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: uniform_vec_i16_LH:
@@ -466,22 +466,22 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: uniform_vec_i16_HH:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_pack_hh_b32_b16 s0, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX906-LABEL: uniform_vec_i16_HH:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
-; GFX906-NEXT:    v_mov_b32_e32 v1, s2
-; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT:    s_pack_hh_b32_b16 s0, s6, s7
+; GFX906-NEXT:    v_mov_b32_e32 v1, s0
+; GFX906-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX906-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: uniform_vec_i16_HH:
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index b8936911f0576..ae280c5a443e1 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1831,21 +1831,21 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
 ;
 ; GFX9-LABEL: s_copysign_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    s_lshr_b32 s1, s7, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s6, 16
+; GFX9-NEXT:    v_bfi_b32 v1, s0, v1, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_bfi_b32 v2, s4, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_bfi_b32 v2, s0, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_copysign_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 0468175c5df50..6c5b2917855fc 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -118,10 +118,10 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX10-LABEL: s_fdiv_f32_ninf:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v3, -v0, v1, 1.0
@@ -133,8 +133,8 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_ninf:
@@ -275,21 +275,21 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX10-LABEL: s_fdiv_f32_ieee:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX10-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX10-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX10-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_ieee:
@@ -370,16 +370,16 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
 ;
 ; GFX10-LABEL: s_fdiv_25ulp_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |s3|
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4
-; GFX10-NEXT:    v_mul_f32_e32 v1, s3, v0
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |s7|
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0
+; GFX10-NEXT:    v_mul_f32_e32 v1, s7, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX10-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_25ulp_f32:
@@ -482,18 +482,18 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a
 ;
 ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, s3
-; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, s3
-; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, s2
-; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v3, s2
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, s7
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, s7
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, s6
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v3, s6
 ; GFX10-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32:
@@ -559,12 +559,12 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_fast_ieee_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_fast_ieee_f32:
@@ -623,12 +623,12 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_f32_fast_math:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_fast_math:
@@ -687,12 +687,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo
 ;
 ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math:
@@ -829,10 +829,10 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_f32_arcp_daz:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v3, -v0, v1, 1.0
@@ -844,8 +844,8 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_arcp_daz:
@@ -916,12 +916,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_f32_arcp_ninf:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
-; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
+; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_arcp_ninf:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index e44572985e6d2..26714dcc6dfac 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v2, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
-; GCN3-NEXT:    v_mov_b32_e32 v3, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v2, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -4570,12 +4570,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s0
-; GCN3-NEXT:    v_mov_b32_e32 v2, s2
-; GCN3-NEXT:    v_mov_b32_e32 v1, s1
-; GCN3-NEXT:    v_mov_b32_e32 v3, s3
+; GCN3-NEXT:    v_mov_b32_e32 v0, s4
+; GCN3-NEXT:    v_mov_b32_e32 v2, s6
+; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 5bd527149572e..66aacd7062a6d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -3883,13 +3883,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN3-NEXT:    s_mov_b32 s4, s3
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN3-NEXT:    s_mov_b32 s0, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -3897,7 +3897,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN3-NEXT:  .LBB88_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_i32_e32 v2, s2, v3
+; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -4085,13 +4085,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN3-NEXT:    s_mov_b32 s4, s3
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN3-NEXT:    s_mov_b32 s0, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
@@ -4099,7 +4099,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ; GCN3-NEXT:  .LBB90_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_i32_e32 v2, s2, v3
+; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -5026,13 +5026,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ;
 ; GCN3-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN3-NEXT:    s_mov_b32 s4, s3
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN3-NEXT:    s_mov_b32 s0, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -5040,7 +5040,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ; GCN3-NEXT:  .LBB102_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_u32_e32 v2, s2, v3
+; GCN3-NEXT:    v_max_u32_e32 v2, s6, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -6820,13 +6820,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN3-LABEL: atomic_min_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
-; GCN3-NEXT:    s_mov_b32 s4, s3
-; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN3-NEXT:    s_add_u32 s0, s0, s4
-; GCN3-NEXT:    s_addc_u32 s1, s1, s5
+; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
+; GCN3-NEXT:    s_mov_b32 s0, s7
+; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GCN3-NEXT:    s_add_u32 s0, s4, s0
+; GCN3-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -6834,7 +6834,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN3-NEXT:  .LBB125_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_min_i32_e32 v2, s2, v3
+; GCN3-NEXT:    v_min_i32_e32 v2, s6, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 718be90eb75fc..9943976dd86da 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -1389,49 +1389,49 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_f16:
@@ -1558,49 +1558,49 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_inv_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_inv_f16:
@@ -1727,49 +1727,49 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16:
@@ -1897,49 +1897,49 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16:
@@ -2067,49 +2067,49 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; GFX10-FLUSH-LABEL: neg_neg_mad_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: neg_neg_mad_f16:
@@ -2238,49 +2238,49 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 5761c198e20ba..f55e9f4821b47 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -316,10 +316,10 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
 define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX940-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -359,10 +359,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
 define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    ds_pk_add_bf16 v0, v1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 4ea3323a9dbfc..3c4087fe391b6 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -137,12 +137,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: fshl_i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 25
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_alignbit_b32 v1, s6, v1, 25
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_i32_imm:
@@ -159,11 +159,11 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: fshl_i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 25
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_alignbit_b32 v1, s6, s7, 25
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshl_i32_imm:
@@ -734,15 +734,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ;
 ; GFX9-LABEL: orxor2or1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s4, s2, 7
-; GFX9-NEXT:    s_or_b32 s4, s3, s4
-; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_lshl_b32 s0, s6, 7
+; GFX9-NEXT:    s_or_b32 s0, s7, s0
+; GFX9-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: orxor2or1:
@@ -761,15 +761,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ;
 ; GFX10-LABEL: orxor2or1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s4, s2, 7
-; GFX10-NEXT:    s_or_b32 s4, s3, s4
-; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s2, s2, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_lshl_b32 s0, s6, 7
+; GFX10-NEXT:    s_or_b32 s0, s7, s0
+; GFX10-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX10-NEXT:    s_cselect_b32 s0, s6, s7
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: orxor2or1:
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index e8310e73f9a47..e8377763e4be2 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -129,12 +129,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: fshr_i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 7
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_alignbit_b32 v1, s6, v1, 7
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_i32_imm:
@@ -151,11 +151,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: fshr_i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 7
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_alignbit_b32 v1, s6, s7, 7
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshr_i32_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index dac3a3db7b450..d4398e5367c7f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -4155,12 +4155,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4406,12 +4406,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 516c92f1640ea..1fa7c52a68802 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -4679,28 +4679,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_mov_b32 s4, s3
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_mov_b32 s0, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_i32_e32 v0, s2, v1
+; GFX9-NEXT:    v_max_i32_e32 v0, s6, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB91_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -4890,28 +4890,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_mov_b32 s4, s3
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_mov_b32 s0, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:  .LBB93_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_i32_e32 v0, s2, v1
+; GFX9-NEXT:    v_max_i32_e32 v0, s6, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB93_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -5932,28 +5932,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_umax_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_mov_b32 s4, s3
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_mov_b32 s0, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:  .LBB105_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_u32_e32 v0, s2, v1
+; GFX9-NEXT:    v_max_u32_e32 v0, s6, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB105_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -7923,28 +7923,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_min_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX9-NEXT:    s_mov_b32 s4, s3
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
+; GFX9-NEXT:    s_mov_b32 s0, s7
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_min_i32_e32 v0, s2, v1
+; GFX9-NEXT:    v_min_i32_e32 v0, s6, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB128_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index df03e89370377..d5265e364a17e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -436,121 +436,121 @@ entry:
 define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX9-NEXT:    s_mul_i32 s5, s4, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-NEXT:    s_add_i32 s6, s4, 1
-; GFX9-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX9-NEXT:    s_mul_i32 s1, s0, s7
+; GFX9-NEXT:    s_sub_i32 s1, s6, s1
+; GFX9-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-NEXT:    s_sub_i32 s3, s1, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX9-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: udiv_i32:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX90A-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX90A-NEXT:    s_mul_i32 s4, s4, s5
-; GFX90A-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX90A-NEXT:    s_add_i32 s5, s5, s4
-; GFX90A-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX90A-NEXT:    s_mul_i32 s5, s4, s3
-; GFX90A-NEXT:    s_sub_i32 s2, s2, s5
-; GFX90A-NEXT:    s_add_i32 s6, s4, 1
-; GFX90A-NEXT:    s_sub_i32 s5, s2, s3
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX90A-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX90A-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX90A-NEXT:    s_add_i32 s5, s4, 1
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX90A-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX90A-NEXT:    s_mul_i32 s0, s0, s1
+; GFX90A-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX90A-NEXT:    s_add_i32 s1, s1, s0
+; GFX90A-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX90A-NEXT:    s_mul_i32 s1, s0, s7
+; GFX90A-NEXT:    s_sub_i32 s1, s6, s1
+; GFX90A-NEXT:    s_add_i32 s2, s0, 1
+; GFX90A-NEXT:    s_sub_i32 s3, s1, s7
+; GFX90A-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX90A-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX90A-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX90A-NEXT:    s_add_i32 s2, s0, 1
+; GFX90A-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX90A-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udiv_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX10-NEXT:    s_sub_i32 s5, 0, s3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX10-NEXT:    s_sub_i32 s1, 0, s7
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_mul_i32 s5, s5, s4
-; GFX10-NEXT:    s_mul_hi_u32 s5, s4, s5
-; GFX10-NEXT:    s_add_i32 s4, s4, s5
-; GFX10-NEXT:    s_mul_hi_u32 s4, s2, s4
-; GFX10-NEXT:    s_mul_i32 s5, s4, s3
-; GFX10-NEXT:    s_sub_i32 s2, s2, s5
-; GFX10-NEXT:    s_add_i32 s5, s4, 1
-; GFX10-NEXT:    s_sub_i32 s6, s2, s3
-; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX10-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX10-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX10-NEXT:    s_add_i32 s5, s4, 1
-; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX10-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_mul_i32 s1, s1, s0
+; GFX10-NEXT:    s_mul_hi_u32 s1, s0, s1
+; GFX10-NEXT:    s_add_i32 s0, s0, s1
+; GFX10-NEXT:    s_mul_hi_u32 s0, s6, s0
+; GFX10-NEXT:    s_mul_i32 s1, s0, s7
+; GFX10-NEXT:    s_add_i32 s2, s0, 1
+; GFX10-NEXT:    s_sub_i32 s1, s6, s1
+; GFX10-NEXT:    s_sub_i32 s3, s1, s7
+; GFX10-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX10-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX10-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX10-NEXT:    s_add_i32 s2, s0, 1
+; GFX10-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX10-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-FLATSCR-LABEL: udiv_i32:
 ; GFX9-FLATSCR:       ; %bb.0:
-; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-FLATSCR-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-FLATSCR-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s0, 0, s7
 ; GFX9-FLATSCR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-FLATSCR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-FLATSCR-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-FLATSCR-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s5, s4
-; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s2, s5
-; GFX9-FLATSCR-NEXT:    s_mul_i32 s5, s4, s3
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s2, s2, s5
-; GFX9-FLATSCR-NEXT:    s_add_i32 s6, s4, 1
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s5, s2, s3
-; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s4, s6, s4
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s4, 1
-; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s4
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s0, s1, s0
+; GFX9-FLATSCR-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s0, s6, s1
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s1, s0, s7
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s1, s6, s1
+; GFX9-FLATSCR-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s3, s1, s7
+; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX9-FLATSCR-NEXT:    s_add_i32 s2, s0, 1
+; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 920ff8a927e2d..45a1afbf11992 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -32,21 +32,21 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x
 ;
 ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s6, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, s2, s3
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, s6, s7
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index 17b941c59fd3f..5d20a848bd6a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -27,12 +27,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
@@ -50,12 +50,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
@@ -78,12 +78,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
@@ -101,12 +101,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %src_input = call float @llvm.fabs.f32(float %src)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index ce055d6527996..674fec1b865a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -30,14 +30,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |v0|
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |v0|
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs:
@@ -88,14 +88,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_cmp_eq_f32_e64 s[2:3], |s2|, |v0|
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], |s6|, |v0|
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index 309fd99031155..8fe85e49a4207 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -1759,16 +1759,16 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
 ;
 ; GFX10-LABEL: v_icmp_i1_ne0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_gt_u32 s2, 1
-; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX10-NEXT:    s_cmp_gt_u32 s3, 2
-; GFX10-NEXT:    s_cselect_b32 s3, -1, 0
-; GFX10-NEXT:    s_and_b32 s2, s2, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_cmp_gt_u32 s6, 1
+; GFX10-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX10-NEXT:    s_cmp_gt_u32 s7, 2
+; GFX10-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX10-NEXT:    s_and_b32 s0, s0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %c0 = icmp ugt i32 %a, 1
   %c1 = icmp ugt i32 %b, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index 5f979e0177f58..a650f999835c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -1986,17 +1986,17 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
 ;
 ; GFX9-LABEL: v_icmp_i1_ne0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_gt_u32 s2, 1
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_gt_u32 s3, 2
+; GFX9-NEXT:    s_cmp_gt_u32 s6, 1
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_cmp_gt_u32 s7, 2
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %c0 = icmp ugt i32 %a, 1
   %c1 = icmp ugt i32 %b, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index dba67a03c000e..b0706025f0b68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -14,12 +14,12 @@ entry:
 define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_iglp_opt_mfma_gemm:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GCN-NEXT:    ; iglp_opt mask(0x00000000)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    v_add_u32_e32 v1, s2, v0
 ; GCN-NEXT:    v_add_u32_e32 v2, 0x6000, v1
 ; GCN-NEXT:    ds_read_b128 a[28:31], v2 offset:57456
 ; GCN-NEXT:    ds_read_b128 a[24:27], v2 offset:57440
@@ -44,7 +44,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in,
 ; GCN-NEXT:    ds_read_b128 a[152:155], v1 offset:96
 ; GCN-NEXT:    ds_read_b128 a[68:71], v1 offset:24592
 ; GCN-NEXT:    ds_read_b128 a[64:67], v1 offset:24576
-; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
+; GCN-NEXT:    v_add_u32_e32 v0, s3, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(4)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
 ; GCN-NEXT:    ds_read_b128 a[148:151], v1 offset:80
@@ -80,7 +80,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in,
 ; GCN-NEXT:    ds_write_b128 v0, a[136:139] offset:32
 ; GCN-NEXT:    ds_write_b128 v0, a[132:135] offset:16
 ; GCN-NEXT:    ds_write_b128 v0, a[128:131]
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(8)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
 ; GCN-NEXT:    ds_write_b128 v0, a[56:59] offset:24672
@@ -151,13 +151,13 @@ entry:
 define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GCN-NEXT:    ; iglp_opt mask(0x00000001)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    v_add_u32_e32 v1, s2, v0
 ; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:112
 ; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:96
 ; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:80
@@ -176,7 +176,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
 ; GCN-NEXT:    ds_read_b128 a[136:139], v1 offset:8224
 ; GCN-NEXT:    ds_read_b128 a[132:135], v1 offset:8208
 ; GCN-NEXT:    ds_read_b128 a[128:131], v1 offset:8192
-; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
+; GCN-NEXT:    v_add_u32_e32 v0, s3, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
 ; GCN-NEXT:    ds_read_b128 a[124:127], v1 offset:24688
@@ -218,7 +218,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
 ; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32
 ; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16
 ; GCN-NEXT:    ds_write_b128 v0, a[0:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    ds_write_b128 v0, a[152:155] offset:8288
 ; GCN-NEXT:    ds_write_b128 v0, a[156:159] offset:8304
 ; GCN-NEXT:    ds_write_b128 v0, a[144:147] offset:8256
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 265d64f47bb23..cd92529b77165 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -245,24 +245,26 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
 define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    s_mov_b32 null, 0
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s7
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_mov_b32 null, 0
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s7
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs:
@@ -325,27 +327,17 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0
 }
 
 define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-SDAG-LABEL: v_permlane16_b32_vsv:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s3, s2
-; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-SDAG-NEXT:    s_endpgm
-;
-; GFX10-GISEL-LABEL: v_permlane16_b32_vsv:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-GISEL-NEXT:    s_endpgm
+; GFX10-LABEL: v_permlane16_b32_vsv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 null, 0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    v_permlane16_b32 v0, v0, s7, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv:
 ; GFX11-SDAG:       ; %bb.0:
@@ -781,24 +773,26 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
 define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    s_mov_b32 null, 0
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s7
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    s_mov_b32 null, 0
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s7
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs:
@@ -861,27 +855,17 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src
 }
 
 define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s3, s2
-; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-SDAG-NEXT:    s_endpgm
-;
-; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-GISEL-NEXT:    s_endpgm
+; GFX10-LABEL: v_permlanex16_b32_vsv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 null, 0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv:
 ; GFX11-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 10f09b6390aba..ae5b62ffb285b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -29,18 +29,18 @@ entry:
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GCN-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(8) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
@@ -82,32 +82,32 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 ; GCN-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; GCN-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
-; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
-; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:112
+; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:96
+; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:80
+; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:64
+; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
+; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:32
+; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:16
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(30) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
 ; GCN-NEXT:    s_endpgm
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
+; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(7)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
@@ -149,14 +149,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:32
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(30) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
@@ -178,17 +178,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:96
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; GCN-NEXT:    v_mul_lo_u32 v31, v31, v31
@@ -198,12 +198,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:112
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -218,11 +218,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:80
 ; GCN-NEXT:    v_mul_lo_u32 v14, v14, v14
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:32
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -242,14 +242,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; GCN-NEXT:    v_mul_lo_u32 v20, v20, v20
 ; GCN-NEXT:    v_mul_lo_u32 v16, v16, v16
-; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
-; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
-; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:80
-; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:64
-; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:32
-; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:16
-; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:112
+; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:96
+; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:80
+; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:64
+; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
+; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:32
+; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:16
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -258,17 +258,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:96
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v31, v31, v31
@@ -278,12 +278,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:112
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -298,11 +298,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:80
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v14, v14, v14
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:64
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:32
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -322,14 +322,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v20, v20, v20
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v16, v16, v16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:80
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:32
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:32
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -381,18 +381,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GCN-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:48
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v7, v7, v7
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
 ; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
@@ -403,25 +403,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:112
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:96
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:96
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:64
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -430,9 +430,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
-; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:64
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:16
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -452,15 +452,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:80
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:80
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -469,18 +469,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
+; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:48
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v7, v7, v7
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
@@ -491,25 +491,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:112
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:96
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:96
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:64
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -518,9 +518,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:16
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -540,15 +540,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:80
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:80
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -614,10 +614,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    v_add_u32_e32 v1, s2, v0
 ; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:112
 ; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:96
 ; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:80
@@ -661,7 +661,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; GCN-NEXT:    ds_read_b128 a[136:139], v2 offset:57376
 ; GCN-NEXT:    ds_read_b128 a[140:143], v2 offset:57392
 ; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
-; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
+; GCN-NEXT:    v_add_u32_e32 v0, s3, v0
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(14)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
@@ -681,7 +681,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32
 ; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16
 ; GCN-NEXT:    ds_write_b128 v0, a[0:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    ds_write_b128 v0, a[56:59] offset:8288
 ; GCN-NEXT:    ds_write_b128 v0, a[60:63] offset:8304
 ; GCN-NEXT:    ds_write_b128 v0, a[48:51] offset:8256
@@ -720,10 +720,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s0, v0
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s2, v0
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:112
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:96
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:80
@@ -767,7 +767,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[136:139], v2 offset:57376
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[140:143], v2 offset:57392
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v2, 2.0
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s1, v0
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s3, v0
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(14)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
@@ -787,7 +787,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:32
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:16
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3]
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s3
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[56:59] offset:8288
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[60:63] offset:8304
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[48:51] offset:8256
@@ -862,12 +862,12 @@ entry:
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v0, s0, v1
+; GCN-NEXT:    v_add_u32_e32 v0, s2, v1
 ; GCN-NEXT:    ds_read_b128 a[28:31], v0 offset:112
 ; GCN-NEXT:    ds_read_b128 a[24:27], v0 offset:96
 ; GCN-NEXT:    ds_read_b128 a[20:23], v0 offset:80
@@ -878,7 +878,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ds_read_b128 a[12:15], v0 offset:48
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT:    v_add_u32_e32 v1, s1, v1
+; GCN-NEXT:    v_add_u32_e32 v1, s3, v1
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    s_nop 7
@@ -902,7 +902,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ds_read_b128 a[0:3], v0 offset:8192
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
@@ -995,12 +995,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
-; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v2, 1.0
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v3, 2.0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s0, v1
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s2, v1
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v0 offset:112
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v0 offset:96
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v0 offset:80
@@ -1011,7 +1011,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v0 offset:48
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s1, v1
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s3, v1
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_nop 7
@@ -1035,7 +1035,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v0 offset:8192
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v1, s1
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v1, s3
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 3a867879bb809..4ce0ff20e3b73 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -440,59 +440,59 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ;
 ; GFX900-SDAG-LABEL: s_exp_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc2ce8ed0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s3, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s7, v0
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v0, -v2
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v0, -v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v1, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s2, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v1, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s6, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v7, v6
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v0, -v6
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v0, -v6
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v8, v6, v7
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v1, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v1, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, v8, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v0, -v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v0, -v2
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v1, v3
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s7, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v1, v3
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v0, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v0, -v5
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v1, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v1, v0
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v1, v5
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
@@ -502,18 +502,18 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; SI-SDAG-LABEL: s_exp_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index a162949587481..5ab960f47f57b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -442,59 +442,59 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_exp10_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x40549a78
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x33979a37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc23369f4
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s3, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s7, v0
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v0, -v2
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v0, -v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v1, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s2, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v1, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s6, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v7, v6
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v0, -v6
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v0, -v6
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v8, v6, v7
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v1, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v1, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, v8, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v3, 0x421a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp10_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x40549a78
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x33979a37
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v0, -v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v0, -v2
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v1, v3
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s7, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v1, v3
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v0, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v0, -v5
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v1, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v1, v0
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v1, v5
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
@@ -504,18 +504,18 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc23369f4
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x421a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; SI-SDAG-LABEL: s_exp10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 36e78975cdb01..6cca705f7b1db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -270,25 +270,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_exp2_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX900-SDAG-NEXT:    v_add_f32_e32 v4, s3, v4
-; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, s2, v1
+; GFX900-SDAG-NEXT:    v_add_f32_e32 v4, s7, v4
+; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, s6, v1
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v4, v4
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, v4, v3
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 035b2439eff15..90a15ae8d9b28 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -313,25 +313,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_log2_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v4, s3, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v4, s7, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v4, v4
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v1, v4, v3
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_log2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index 7ad7cc821c1b5..8196999b8f1f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -133,31 +133,57 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: round_v2f32:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT:    s_brev_b32 s8, -2
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s3
-; GFX89-NEXT:    v_sub_f32_e32 v1, s3, v0
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v2, s3
-; GFX89-NEXT:    v_bfi_b32 v1, s8, v1, v2
-; GFX89-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX89-NEXT:    v_trunc_f32_e32 v0, s2
-; GFX89-NEXT:    v_sub_f32_e32 v2, s2, v0
-; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
-; GFX89-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v3, s2
-; GFX89-NEXT:    v_bfi_b32 v2, s8, v2, v3
-; GFX89-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: round_v2f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_brev_b32 s8, -2
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s3
+; GFX8-NEXT:    v_sub_f32_e32 v1, s3, v0
+; GFX8-NEXT:    s_mov_b32 s4, s0
+; GFX8-NEXT:    s_mov_b32 s5, s1
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_bfi_b32 v1, s8, v1, v2
+; GFX8-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX8-NEXT:    v_trunc_f32_e32 v0, s2
+; GFX8-NEXT:    v_sub_f32_e32 v2, s2, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_bfi_b32 v2, s8, v2, v3
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: round_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_brev_b32 s8, -2
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s7
+; GFX9-NEXT:    v_sub_f32_e32 v1, s7, v0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_bfi_b32 v1, s8, v1, v2
+; GFX9-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX9-NEXT:    v_sub_f32_e32 v2, s6, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    v_bfi_b32 v2, s8, v2, v3
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_v2f32:
 ; GFX11:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 994ef22539a65..a54405bf1b471 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -8,12 +8,12 @@
 define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
 ; GFX9-LABEL: s_lshr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s3, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s7, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_lshr_v2i16:
@@ -54,11 +54,11 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 ;
 ; GFX10-LABEL: s_lshr_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s7, s6
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_lshr_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 9ec37a5e14cdf..944db3d3adc3a 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -220,40 +220,40 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX9-LABEL: madak_2_use_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x41200000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
 ; GFX9-NEXT:    v_mac_f32_e32 v4, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v0, v4, s[2:3] offset:4
+; GFX9-NEXT:    global_store_dword v0, v4, s[6:7] offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: madak_2_use_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-MAD-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
 ; GFX10-MAD-NEXT:    v_madak_f32 v1, v1, v3, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX10-MAD-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[6:7] offset:4
 ; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
@@ -282,40 +282,40 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX940-FMA-LABEL: madak_2_use_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v4, 0x41200000
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7] sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
 ; GFX940-FMA-NEXT:    v_fmac_f32_e32 v4, v1, v3
-; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v4, s[6:7] offset:4 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: madak_2_use_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v1, v1, v3, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX10-FMA-NEXT:    global_store_dword v0, v2, s[4:5]
 ; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[6:7] offset:4
 ; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
@@ -398,24 +398,24 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-LABEL: madak_m_inline_imm_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: madak_m_inline_imm_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-MAD-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
 ; GFX11-MAD-LABEL: madak_m_inline_imm_f32:
@@ -435,24 +435,24 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX940-FMA-LABEL: madak_m_inline_imm_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
-; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: madak_m_inline_imm_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: madak_m_inline_imm_f32:
@@ -931,23 +931,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ;
 ; GFX9-LABEL: s_s_madak_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    v_mac_f32_e32 v1, s2, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_mac_f32_e32 v1, s6, v2
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: s_s_madak_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10-MAD-NEXT:    v_madak_f32 v0, s2, v0, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, s7
+; GFX10-MAD-NEXT:    v_madak_f32 v0, s6, v0, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
 ; GFX11-MAD-LABEL: s_s_madak_f32:
@@ -964,23 +964,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ;
 ; GFX940-FMA-LABEL: s_s_madak_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, s3
-; GFX940-FMA-NEXT:    v_fmac_f32_e32 v1, s2, v2
-; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, s7
+; GFX940-FMA-NEXT:    v_fmac_f32_e32 v1, s6, v2
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: s_s_madak_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10-FMA-NEXT:    v_fmaak_f32 v0, s2, v0, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, s7
+; GFX10-FMA-NEXT:    v_fmaak_f32 v0, s6, v0, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: s_s_madak_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 940287d44d8d1..86a5055ab0704 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -5,21 +5,21 @@
 define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) {
 ; GCN-LABEL: vector_clause:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:48
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-SCRATCH-LABEL: vector_clause:
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
index 9d6e0927b0dfd..c98cfa08160ca 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
@@ -8,17 +8,17 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
 define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: ctlz_i64_poison:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -40,23 +40,23 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ctlz_i64_poison:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
@@ -76,7 +76,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
@@ -87,17 +87,17 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: ctlz_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -120,23 +120,23 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ctlz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
@@ -157,7 +157,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone
@@ -168,17 +168,17 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: cttz_i64_poison:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -200,23 +200,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX9-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cttz_i64_poison:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:7
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
@@ -238,7 +238,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
@@ -249,17 +249,17 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: cttz_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -282,23 +282,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cttz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3]
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:7
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:6
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
@@ -321,7 +321,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 357b851a8f56f..58fd4b9bd2fee 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -39,17 +39,17 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; GFX9-LABEL: test_smul24_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
-; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_bfe_i32 s4, s6, 0x180000
+; GFX9-NEXT:    s_bfe_i32 s5, s7, 0x180000
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: test_smul24_i32:
@@ -126,17 +126,17 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: test_smulhi24_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
-; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
-; GFX9-NEXT:    s_mul_hi_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_bfe_i32 s4, s6, 0x180000
+; GFX9-NEXT:    s_bfe_i32 s5, s7, 0x180000
+; GFX9-NEXT:    s_mul_hi_i32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: test_smulhi24_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 3a16c88f32cc3..698a54de108f7 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -39,17 +39,17 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; GFX9-LABEL: test_umul24_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
-; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_and_b32 s4, s6, 0xffffff
+; GFX9-NEXT:    s_and_b32 s5, s7, 0xffffff
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
   %0 = shl i32 %a, 8
@@ -405,17 +405,17 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a,
 ;
 ; GFX9-LABEL: test_umulhi24_i32_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
-; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
-; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_and_b32 s4, s6, 0xffffff
+; GFX9-NEXT:    s_and_b32 s5, s7, 0xffffff
+; GFX9-NEXT:    s_mul_hi_u32 s4, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
   %a.24 = and i32 %a, 16777215
@@ -663,14 +663,14 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: test_umulhi16_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX9-NEXT:    s_mul_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
+; GFX9-NEXT:    s_and_b32 s0, s6, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, s7, 0xffff
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %a.16 = and i32 %a, 65535
diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
index a3f7906a05f6b..1899a0abc6592 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -4,17 +4,17 @@
 define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -35,17 +35,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -67,17 +67,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -99,17 +99,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -132,17 +132,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_neg_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -163,17 +163,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_neg_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -194,16 +194,16 @@ bb:
 define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: add_vector_neg_bitcast_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v0, v0
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
-; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
@@ -222,11 +222,11 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_lo_neg_scalar_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v3, v1
@@ -237,7 +237,7 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspa
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GCN-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1
-; GCN-NEXT:    global_store_dword v4, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v4, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -261,10 +261,10 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v3, v1
@@ -273,7 +273,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v3 neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
+; GCN-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
index ce92d40cca2b0..2ce0b9eed02cb 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
@@ -50,40 +50,40 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a
 define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) {
 ; SDAG-LABEL: buffers_from_flat_dont_alias:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-NEXT:    s_mov_b32 s7, 0
-; SDAG-NEXT:    s_mov_b32 s6, 16
+; SDAG-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; SDAG-NEXT:    s_mov_b32 s3, 0
+; SDAG-NEXT:    s_mov_b32 s2, 16
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    s_and_b32 s5, s1, 0xffff
-; SDAG-NEXT:    s_mov_b32 s4, s0
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; SDAG-NEXT:    s_and_b32 s5, s3, 0xffff
-; SDAG-NEXT:    s_mov_b32 s4, s2
+; SDAG-NEXT:    s_and_b32 s1, s5, 0xffff
+; SDAG-NEXT:    s_mov_b32 s0, s4
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT:    s_and_b32 s1, s7, 0xffff
+; SDAG-NEXT:    s_mov_b32 s0, s6
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; SDAG-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; SDAG-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; SDAG-NEXT:    v_mul_f32_e32 v3, v3, v3
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: buffers_from_flat_dont_alias:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-NEXT:    s_mov_b32 s7, 0
-; GISEL-NEXT:    s_mov_b32 s6, 16
+; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-NEXT:    s_mov_b32 s3, 0
+; GISEL-NEXT:    s_mov_b32 s2, 16
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_and_b32 s5, s1, 0xffff
-; GISEL-NEXT:    s_mov_b32 s4, s0
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    s_and_b32 s5, s3, 0xffff
-; GISEL-NEXT:    s_mov_b32 s4, s2
+; GISEL-NEXT:    s_and_b32 s1, s5, 0xffff
+; GISEL-NEXT:    s_mov_b32 s0, s4
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GISEL-NEXT:    s_and_b32 s1, s7, 0xffff
+; GISEL-NEXT:    s_mov_b32 s0, s6
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v3, v3, v3
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GISEL-NEXT:    s_endpgm
   %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0)
   %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index a87973d93ac77..fcccd2da07f76 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -47,12 +47,12 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: rotl_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sub_i32 s3, 32, s3
-; GFX10-NEXT:    v_alignbit_b32 v1, s2, s2, s3
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_sub_i32 s0, 32, s7
+; GFX10-NEXT:    v_alignbit_b32 v1, s6, s6, s0
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotl_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 058ee589bc4b0..214894092a8b0 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -43,11 +43,11 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: rotr_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s2, s2, s3
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_alignbit_b32 v1, s6, s6, s7
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotr_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index b81af3eb838f1..acdcb631dccbd 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -8,15 +8,15 @@
 define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
 ; GFX9-LABEL: s_shl_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s3, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s7, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_v2i16:
@@ -59,14 +59,14 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 ;
 ; GFX10-LABEL: s_shl_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s3, s2
-; GFX10-NEXT:    s_mov_b32 s4, s0
-; GFX10-NEXT:    s_mov_b32 s5, s1
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s7, s6
+; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_shl_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index ded308ae4f230..3ae982089228d 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -33,12 +33,12 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ;
 ; GFX9-LABEL: s_sub_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_sub_i32 s0, s6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_sub_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 6ec213a06999b..bfeab97d81dbe 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -214,15 +214,15 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr
 define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
 ; GFX9-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_pk_sub_i16 v0, s6, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_sub_v2i16_kernarg:
@@ -246,14 +246,14 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ;
 ; GFX10-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_sub_i16 v0, s2, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
-; GFX10-NEXT:    s_mov_b32 s5, s1
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    v_pk_sub_i16 v0, s6, s7
+; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_sub_v2i16_kernarg:
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index fc6df735c05b0..65eb1cee42350 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -135,13 +135,13 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
 ;
 ; GFX10-LABEL: v_cnd_nan:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_eq_u32 s2, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1, s3, s[4:5]
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1, s7, s[0:1]
+; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_cnd_nan:
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 89fef7eead839..e3185e189157b 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -104,15 +104,15 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ;
 ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr:
 ; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_med3_i16 v2, s2, 0, v1
-; SDAG-GFX9-NEXT:    v_med3_i16 v1, s3, 0, v1
+; SDAG-GFX9-NEXT:    v_med3_i16 v2, s6, 0, v1
+; SDAG-GFX9-NEXT:    v_med3_i16 v1, s7, 0, v1
 ; SDAG-GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SDAG-GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
-; SDAG-GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; SDAG-GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; SDAG-GFX9-NEXT:    s_endpgm
 ;
 ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr:
@@ -156,22 +156,22 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ;
 ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr:
 ; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s4, 0
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s5, 0xff
+; GISEL-GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s0, 0
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s1, 0xff
 ; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s6
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s7
+; GISEL-GFX9-NEXT:    s_max_i32 s2, s2, s0
+; GISEL-GFX9-NEXT:    s_max_i32 s0, s3, s0
 ; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s3
-; GISEL-GFX9-NEXT:    s_max_i32 s2, s2, s4
-; GISEL-GFX9-NEXT:    s_max_i32 s3, s3, s4
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s3
-; GISEL-GFX9-NEXT:    s_min_i32 s2, s2, s5
-; GISEL-GFX9-NEXT:    s_min_i32 s3, s3, s5
-; GISEL-GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s0, s0
+; GISEL-GFX9-NEXT:    s_min_i32 s2, s2, s1
+; GISEL-GFX9-NEXT:    s_min_i32 s0, s0, s1
+; GISEL-GFX9-NEXT:    s_pack_ll_b32_b16 s0, s2, s0
+; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GISEL-GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 901e88a4c6aca..e12a4beb5dbe5 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1344,40 +1344,40 @@ exit:
 define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
 ; GFX1032-LABEL: fdiv_f32:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
+; GFX1032-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
 ; GFX1032-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX1032-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX1032-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX1032-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
+; GFX1032-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
 ; GFX1032-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX1032-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX1032-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX1032-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX1032-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX1032-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX1032-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: fdiv_f32:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_div_scale_f32 v0, s[4:5], s3, s3, s2
+; GFX1064-NEXT:    v_div_scale_f32 v0, s[0:1], s7, s7, s6
 ; GFX1064-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX1064-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX1064-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX1064-NEXT:    v_div_scale_f32 v2, vcc, s2, s3, s2
+; GFX1064-NEXT:    v_div_scale_f32 v2, vcc, s6, s7, s6
 ; GFX1064-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX1064-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX1064-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX1064-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX1064-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
-; GFX1064-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
+; GFX1064-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX1064-NEXT:    s_endpgm
 entry:
   %fdiv = fdiv float %a, %b
@@ -2138,23 +2138,23 @@ main_body:
 define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX1032-LABEL: test_intr_fcmp_i64:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1032-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX1032-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_intr_fcmp_i64:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |s7|
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1064-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
@@ -2195,22 +2195,22 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) {
 define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX1032-LABEL: test_intr_fcmp_i32:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_intr_fcmp_i32:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |s7|
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX1064-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)

>From 5887ff638a32e325921a09c05bba7babce413e96 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Wed, 3 Jul 2024 13:28:04 +0000
Subject: [PATCH 6/6] code fixup.

---
 llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ea8ad39b90257..ae537b194f50c 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1727,7 +1727,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
     // under-aligned.
     const MachineMemOperand *MMO = *CI.I->memoperands_begin();
     bool NeedsConstrainedOpc =
-        STM->isXNACKEnabled() && MMO->getAlign().value() < (Width << 2);
+        STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
     switch (Width) {
     default:
       return 0;



More information about the llvm-branch-commits mailing list