[llvm] AMDGPU/GlobalISel: Regbanklegalize rules for G_PHI (PR #179735)

Wed Feb 18 04:02:25 PST 2026

https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/179735

From a210b352df29de054cf9a76488611dfc44e8fb68 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Wed, 4 Feb 2026 20:27:49 +0100
Subject: [PATCH] AMDGPU/GlobalISel: Regbanklegalize rules for G_PHI

Move G_PHI handling to AMDGPURegBankLegalizeRules.cpp.
Support all legal types.
---
 .../Target/AMDGPU/AMDGPURegBankLegalize.cpp   |   6 -
 .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp    |  88 ++---
 .../AMDGPU/AMDGPURegBankLegalizeHelper.h      |   1 -
 .../AMDGPU/AMDGPURegBankLegalizeRules.cpp     |   6 +
 .../AMDGPU/AMDGPURegBankLegalizeRules.h       |   5 +-
 .../GlobalISel/divergent-control-flow.ll      |  52 +--
 .../GlobalISel/llvm.amdgcn.set.inactive.ll    | 244 +++++++------
 .../CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll  | 330 ++++++++++--------
 .../CodeGen/AMDGPU/GlobalISel/llvm.memset.ll  |  91 +++--
 .../AMDGPU/GlobalISel/mul-known-bits.i64.ll   |   4 +-
 .../AMDGPU/GlobalISel/vni8-across-blocks.ll   |  34 +-
 llvm/test/CodeGen/AMDGPU/fptoi.i128.ll        | 135 +++----
 llvm/test/CodeGen/AMDGPU/global-saddr-load.ll |  80 ++---
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll  |  20 +-
 .../CodeGen/AMDGPU/remat-fp64-constants.ll    |   2 +-
 15 files changed, 574 insertions(+), 524 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index 888717f13ebe9..87275c2d73959 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -459,12 +459,6 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
       continue;
 
     unsigned Opc = MI->getOpcode();
-    // Insert point for use operands needs some calculation.
-    if (Opc == AMDGPU::G_PHI) {
-      if (!RBLHelper.applyMappingPHI(*MI))
-        return false;
-      continue;
-    }
 
     // Opcodes that support pretty much all combinations of reg banks and LLTs
     // (except S1). There is no point in writing rules for them.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index d262f074679a8..5501810d4b85e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -1056,6 +1056,44 @@ bool RegBankLegalizeHelper::lower(MachineInstr &MI,
     MI.eraseFromParent();
     return true;
   }
+  case AextToS32InIncomingBlockGPHI: {
+    Register Dst = MI.getOperand(0).getReg();
+    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
+    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
+    MI.getOperand(0).setReg(NewDst);
+    B.buildTrunc(Dst, NewDst);
+
+    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+      Register UseReg = MI.getOperand(i).getReg();
+
+      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
+      MachineBasicBlock *DefMBB = DefMI->getParent();
+
+      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
+
+      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
+      MI.getOperand(i).setReg(NewUse.getReg(0));
+    }
+    break;
+  }
+  case VerifyAllSgprGPHI: {
+    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
+      if (Op.isMBB())
+        return true;
+      return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
+    }));
+    return true;
+  }
+  case VerifyAllSgprOrVgprGPHI: {
+    assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
+    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
+      if (Op.isMBB())
+        return true;
+      const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
+      return RB == VgprRB || RB == SgprRB;
+    }));
+    return true;
+  }
   }
 
   if (!WaterfallSgprs.empty()) {
@@ -1632,56 +1670,6 @@ bool RegBankLegalizeHelper::applyMappingSrc(
   return true;
 }
 
-bool RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
-  Register Dst = MI.getOperand(0).getReg();
-  LLT Ty = MRI.getType(Dst);
-
-  if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
-    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
-
-    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
-    MI.getOperand(0).setReg(NewDst);
-    B.buildTrunc(Dst, NewDst);
-
-    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
-      Register UseReg = MI.getOperand(i).getReg();
-
-      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
-      MachineBasicBlock *DefMBB = DefMI->getParent();
-
-      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
-
-      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
-      MI.getOperand(i).setReg(NewUse.getReg(0));
-    }
-
-    return true;
-  }
-
-  // ALL divergent i1 phis should have been lowered and inst-selected into PHI
-  // with sgpr reg class and S1 LLT in AMDGPUGlobalISelDivergenceLowering pass.
-  // Note: this includes divergent phis that don't require lowering.
-  if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
-    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
-                       "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI",
-                       MI);
-    return false;
-  }
-
-  // We accept all types that can fit in some register class.
-  // Uniform G_PHIs have all sgpr registers.
-  // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
-  if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
-      Ty == LLT::pointer(4, 64)) {
-    return true;
-  }
-
-  reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
-                     "AMDGPU RegBankLegalize: type not supported for G_PHI",
-                     MI);
-  return false;
-}
-
 [[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
                                                      const RegisterBank *RB,
                                                      MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index 86669ae6ff6c7..6eb2ff7009403 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -87,7 +87,6 @@ class RegBankLegalizeHelper {
   bool findRuleAndApplyMapping(MachineInstr &MI);
 
   // Manual apply helpers.
-  bool applyMappingPHI(MachineInstr &MI);
   void applyMappingTrivial(MachineInstr &MI);
 
 private:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 73592d52ad04c..ad68c4dad8ff3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -644,6 +644,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}})
       .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}});
 
+  addRulesForGOpcs({G_PHI})
+      .Any({{UniS1}, {{}, {}, AextToS32InIncomingBlockGPHI}})
+      .Any({{UniS16}, {{}, {}, VerifyAllSgprGPHI}})
+      .Any({{UniBRC}, {{}, {}, VerifyAllSgprGPHI}})
+      .Any({{DivBRC}, {{}, {}, VerifyAllSgprOrVgprGPHI}});
+
   Predicate isSignedICmp([](const MachineInstr &MI) -> bool {
     auto Pred =
         static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index eee4f6276b925..df2663d906d3e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -261,7 +261,10 @@ enum LoweringMethodID {
   UnpackAExt,
   VerifyAllSgpr,
   ApplyAllVgpr,
-  UnmergeToShiftTrunc
+  UnmergeToShiftTrunc,
+  AextToS32InIncomingBlockGPHI,
+  VerifyAllSgprGPHI,
+  VerifyAllSgprOrVgprGPHI
 };
 
 enum FastRulesTypes {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 6148bc2d5ae6e..a98fdb9c91f43 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
 
 ; Make sure the branch targets are correct after lowering llvm.amdgcn.if
 
@@ -8,14 +8,16 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $sgpr6
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %if.true
 ; CHECK-NEXT:    global_load_dword v0, v[0:1], off glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_readfirstlane_b32 s6, v0
 ; CHECK-NEXT:  .LBB0_2: ; %endif
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %c = icmp ne i32 %value, 0
@@ -35,14 +37,16 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $sgpr6
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB1_2
 ; CHECK-NEXT:  ; %bb.1: ; %if.true
 ; CHECK-NEXT:    global_load_dword v0, v[0:1], off glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_readfirstlane_b32 s6, v0
 ; CHECK-NEXT:  .LBB1_2: ; %endif
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %c = icmp ne i32 %value, 0
@@ -64,14 +68,16 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $sgpr6
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB2_2
 ; CHECK-NEXT:  ; %bb.1: ; %if.true
 ; CHECK-NEXT:    global_load_dword v0, v[0:1], off glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_readfirstlane_b32 s6, v0
 ; CHECK-NEXT:  .LBB2_2: ; %endif
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %c = trunc i32 %value to i1
@@ -92,17 +98,19 @@ define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_load_dword v0, v[0:1], off
+; CHECK-NEXT:    ; implicit-def: $sgpr6
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB3_2
 ; CHECK-NEXT:  ; %bb.1: ; %if.true
 ; CHECK-NEXT:    global_load_dword v0, v[0:1], off glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_readfirstlane_b32 s6, v0
 ; CHECK-NEXT:  .LBB3_2: ; %endif
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %value = load i32, ptr addrspace(1) %ptr
@@ -143,22 +151,24 @@ define void @constrained_if_register_class() {
 ; CHECK-NEXT:    s_add_u32 s4, s4, const.ptr@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, const.ptr@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b32 s6, -1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s5
 ; CHECK-NEXT:    flat_load_dword v0, v[0:1]
+; CHECK-NEXT:    s_mov_b32 s4, -1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_ngt_f32_e32 vcc, 1.0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:  ; %bb.3: ; %bb7
 ; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:  ; %bb.4: ; %bb8
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[6:7], s6, 0
-; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], s[6:7]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB4_6
 ; CHECK-NEXT:  ; %bb.5: ; %bb11
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 4.0
@@ -200,33 +210,35 @@ define amdgpu_kernel void @break_loop(i32 %arg) {
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; CHECK-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; CHECK-NEXT:    ; implicit-def: $vgpr1
+; CHECK-NEXT:    ; implicit-def: $sgpr4
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_subrev_u32_e32 v0, s0, v0
 ; CHECK-NEXT:    s_mov_b64 s[0:1], 0
 ; CHECK-NEXT:    s_branch .LBB5_2
 ; CHECK-NEXT:  .LBB5_1: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT:    s_and_b64 s[4:5], exec, s[2:3]
-; CHECK-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
+; CHECK-NEXT:    s_and_b64 s[6:7], exec, s[2:3]
+; CHECK-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; CHECK-NEXT:    s_cbranch_execz .LBB5_4
 ; CHECK-NEXT:  .LBB5_2: ; %bb1
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_add_u32_e32 v1, 1, v1
+; CHECK-NEXT:    s_add_i32 s4, s4, 1
+; CHECK-NEXT:    s_cmp_ge_i32 s4, 0
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
 ; CHECK-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
-; CHECK-NEXT:    s_and_b64 s[4:5], exec, -1
-; CHECK-NEXT:    v_cmp_le_i32_e32 vcc, 0, v1
-; CHECK-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
-; CHECK-NEXT:    s_cbranch_vccnz .LBB5_1
+; CHECK-NEXT:    s_and_b64 s[6:7], exec, exec
+; CHECK-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB5_1
 ; CHECK-NEXT:  ; %bb.3: ; %bb4
 ; CHECK-NEXT:    ; in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT:    global_load_dword v2, v[0:1], off glc
+; CHECK-NEXT:    global_load_dword v1, v[0:1], off glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
-; CHECK-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v2
-; CHECK-NEXT:    s_and_b64 s[4:5], exec, vcc
-; CHECK-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; CHECK-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
+; CHECK-NEXT:    s_and_b64 s[6:7], exec, vcc
+; CHECK-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
 ; CHECK-NEXT:    s_branch .LBB5_1
 ; CHECK-NEXT:  .LBB5_4: ; %bb9
 ; CHECK-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 614e3f50998b8..9ccfde8d4c37c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -1,19 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -global-isel-abort=2 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s
 
 define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
 ; GCN-LABEL: set_inactive:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
@@ -27,9 +27,9 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v0, 1
-; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -43,18 +43,22 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
 ; GCN-LABEL: set_inactive_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v3, s[4:5]
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
   %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
@@ -69,9 +73,9 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
-; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -96,23 +100,23 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s7, 56
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    s_mov_b32 s2, 1
-; GCN-NEXT:    s_cbranch_scc0 .LBB4_2
-; GCN-NEXT:  ; %bb.1: ; %.one
+; GCN-NEXT:    s_mov_b64 s[2:3], -1
+; GCN-NEXT:    s_cbranch_scc1 .LBB4_3
+; GCN-NEXT:  ; %bb.1: ; %Flow
+; GCN-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN-NEXT:    s_cbranch_vccz .LBB4_4
+; GCN-NEXT:  .LBB4_2: ; %.exit
+; GCN-NEXT:    s_endpgm
+; GCN-NEXT:  .LBB4_3: ; %.one
 ; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v1
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], 0
-; GCN-NEXT:    s_mov_b32 s2, 0
-; GCN-NEXT:  .LBB4_2: ; %Flow
-; GCN-NEXT:    s_xor_b32 s2, s2, 1
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    s_cbranch_scc1 .LBB4_4
-; GCN-NEXT:  ; %bb.3: ; %.zero
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; GCN-NEXT:    s_cbranch_execnz .LBB4_2
+; GCN-NEXT:  .LBB4_4: ; %.zero
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
-; GCN-NEXT:  .LBB4_4: ; %.exit
 ; GCN-NEXT:    s_endpgm
   %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
   %cmp = icmp eq i32 %val, 56
@@ -138,17 +142,15 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x40400000
-; GCN-NEXT:    s_mov_b64 exec, s[2:3]
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x40400000
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
@@ -161,22 +163,24 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 ; GCN-LABEL: set_inactive_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0xcccccccd
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x4010cccc
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x4010cccc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
-; GCN-NEXT:    s_mov_b64 exec, s[2:3]
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0xcccccccd
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
   %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0)
@@ -189,17 +193,15 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
-; GCN-NEXT:    s_mov_b64 exec, s[2:3]
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
@@ -213,17 +215,15 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half>
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
-; GCN-NEXT:    s_mov_b64 exec, s[2:3]
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
@@ -236,18 +236,22 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
 ; GCN-LABEL: set_inactive_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 1, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, v2, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, 1, v3, s[4:5]
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
   %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0)
@@ -259,18 +263,22 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
 ; GCN-LABEL: set_inactive_v2f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, v2, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[4:5]
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
   %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0)
@@ -304,21 +312,23 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
 ; GCN-LABEL: set_inactive_v4i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v3, s2
-; GCN-NEXT:    v_mov_b32_e32 v4, s3
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v3, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v4, s[4:5]
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    v_mov_b32_e32 v4, v2
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[3:4], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
   %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0)
@@ -330,21 +340,23 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
 ; GCN-LABEL: set_inactive_v4f16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v3, s2
-; GCN-NEXT:    v_mov_b32_e32 v4, s3
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v3, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v4, s[4:5]
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    v_mov_b32_e32 v4, v2
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[3:4], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
   %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0)
@@ -384,18 +396,22 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
 ; GCN-LABEL: set_inactive_p0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v3, s[4:5]
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
   %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0)
@@ -406,16 +422,16 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
 define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
 ; GCN-LABEL: set_inactive_p2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) zeroinitializer) #0
@@ -427,16 +443,16 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
 ; GCN-LABEL: set_inactive_p3:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) zeroinitializer) #0
@@ -448,16 +464,16 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
 ; GCN-LABEL: set_inactive_p5:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) zeroinitializer) #0
@@ -469,16 +485,16 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
 ; GCN-LABEL: set_inactive_p6:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[4:5], 0x2c
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index 993fb7eeb3aa9..b7a25dc76725c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
 
 declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)
 
@@ -10,177 +10,203 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
 ; LOOP-NEXT:    s_mov_b64 s[0:1], 0
 ; LOOP-NEXT:    s_mov_b32 s2, 0
 ; LOOP-NEXT:    s_mov_b32 s3, 0xf000
-; LOOP-NEXT:    v_mov_b32_e32 v5, s1
-; LOOP-NEXT:    v_mov_b32_e32 v4, s0
 ; LOOP-NEXT:  .LBB0_1: ; %static-memcpy-expansion-main-body
 ; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT:    v_add_i32_e32 v6, vcc, v2, v4
-; LOOP-NEXT:    v_addc_u32_e32 v7, vcc, v3, v5, vcc
-; LOOP-NEXT:    buffer_load_ubyte v26, v[6:7], s[0:3], 0 addr64
-; LOOP-NEXT:    s_waitcnt expcnt(5)
-; LOOP-NEXT:    buffer_load_ubyte v29, v[6:7], s[0:3], 0 addr64 offset:1
-; LOOP-NEXT:    s_waitcnt expcnt(2)
-; LOOP-NEXT:    buffer_load_ubyte v31, v[6:7], s[0:3], 0 addr64 offset:2
-; LOOP-NEXT:    buffer_load_ubyte v32, v[6:7], s[0:3], 0 addr64 offset:3
-; LOOP-NEXT:    buffer_load_ubyte v36, v[6:7], s[0:3], 0 addr64 offset:4
-; LOOP-NEXT:    buffer_load_ubyte v37, v[6:7], s[0:3], 0 addr64 offset:5
-; LOOP-NEXT:    buffer_load_ubyte v38, v[6:7], s[0:3], 0 addr64 offset:6
-; LOOP-NEXT:    buffer_load_ubyte v39, v[6:7], s[0:3], 0 addr64 offset:7
-; LOOP-NEXT:    buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64 offset:8
-; LOOP-NEXT:    buffer_load_ubyte v11, v[6:7], s[0:3], 0 addr64 offset:9
-; LOOP-NEXT:    buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT:    s_waitcnt expcnt(1)
+; LOOP-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64
+; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_load_ubyte v7, v[2:3], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_load_ubyte v8, v[2:3], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT:    buffer_load_ubyte v9, v[2:3], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT:    buffer_load_ubyte v10, v[2:3], s[0:3], 0 addr64 offset:6
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    buffer_load_ubyte v13, v[6:7], s[0:3], 0 addr64 offset:11
-; LOOP-NEXT:    buffer_load_ubyte v9, v[6:7], s[0:3], 0 addr64 offset:12
-; LOOP-NEXT:    buffer_load_ubyte v15, v[6:7], s[0:3], 0 addr64 offset:13
-; LOOP-NEXT:    buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:14
-; LOOP-NEXT:    buffer_load_ubyte v17, v[6:7], s[0:3], 0 addr64 offset:15
-; LOOP-NEXT:    buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64 offset:16
-; LOOP-NEXT:    buffer_load_ubyte v19, v[6:7], s[0:3], 0 addr64 offset:17
-; LOOP-NEXT:    buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:18
-; LOOP-NEXT:    buffer_load_ubyte v21, v[6:7], s[0:3], 0 addr64 offset:19
-; LOOP-NEXT:    buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:20
-; LOOP-NEXT:    buffer_load_ubyte v23, v[6:7], s[0:3], 0 addr64 offset:21
-; LOOP-NEXT:    buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:22
-; LOOP-NEXT:    buffer_load_ubyte v25, v[6:7], s[0:3], 0 addr64 offset:23
-; LOOP-NEXT:    buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:24
-; LOOP-NEXT:    buffer_load_ubyte v27, v[6:7], s[0:3], 0 addr64 offset:25
-; LOOP-NEXT:    buffer_load_ubyte v28, v[6:7], s[0:3], 0 addr64 offset:26
-; LOOP-NEXT:    buffer_load_ubyte v30, v[6:7], s[0:3], 0 addr64 offset:27
-; LOOP-NEXT:    buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:28
-; LOOP-NEXT:    buffer_load_ubyte v33, v[6:7], s[0:3], 0 addr64 offset:29
-; LOOP-NEXT:    buffer_load_ubyte v34, v[6:7], s[0:3], 0 addr64 offset:30
-; LOOP-NEXT:    buffer_load_ubyte v35, v[6:7], s[0:3], 0 addr64 offset:31
-; LOOP-NEXT:    s_waitcnt vmcnt(14)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v6, 8, v29
-; LOOP-NEXT:    v_or_b32_e32 v26, v6, v26
-; LOOP-NEXT:    v_lshlrev_b32_e32 v6, 24, v32
-; LOOP-NEXT:    v_lshlrev_b32_e32 v7, 16, v31
-; LOOP-NEXT:    v_or_b32_e32 v29, v6, v7
-; LOOP-NEXT:    v_lshlrev_b32_e32 v6, 8, v37
-; LOOP-NEXT:    v_lshlrev_b32_e32 v7, 24, v39
-; LOOP-NEXT:    v_lshlrev_b32_e32 v32, 16, v38
-; LOOP-NEXT:    v_or_b32_e32 v31, v6, v36
-; LOOP-NEXT:    v_or_b32_e32 v32, v7, v32
-; LOOP-NEXT:    v_add_i32_e32 v6, vcc, v0, v4
-; LOOP-NEXT:    v_addc_u32_e32 v7, vcc, v1, v5, vcc
-; LOOP-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
-; LOOP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; LOOP-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v4
-; LOOP-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; LOOP-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; LOOP-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; LOOP-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; LOOP-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
-; LOOP-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; LOOP-NEXT:    v_lshlrev_b32_e32 v19, 8, v19
-; LOOP-NEXT:    s_waitcnt vmcnt(12)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v21, 24, v21
-; LOOP-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; LOOP-NEXT:    s_waitcnt vmcnt(10)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; LOOP-NEXT:    s_waitcnt vmcnt(8)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v25, 24, v25
-; LOOP-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; LOOP-NEXT:    buffer_load_ubyte v11, v[2:3], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT:    buffer_load_ubyte v12, v[2:3], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT:    s_waitcnt vmcnt(7)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; LOOP-NEXT:    v_or_b32_e32 v4, v5, v4
+; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:9
 ; LOOP-NEXT:    s_waitcnt vmcnt(6)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
+; LOOP-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; LOOP-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; LOOP-NEXT:    v_or_b32_e32 v6, v7, v6
+; LOOP-NEXT:    buffer_load_ubyte v7, v[2:3], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT:    s_waitcnt vmcnt(5)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; LOOP-NEXT:    v_or_b32_e32 v8, v9, v8
+; LOOP-NEXT:    buffer_load_ubyte v9, v[2:3], s[0:3], 0 addr64 offset:11
 ; LOOP-NEXT:    s_waitcnt vmcnt(4)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v30, 24, v30
-; LOOP-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; LOOP-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; LOOP-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; LOOP-NEXT:    v_or_b32_e32 v10, v11, v10
+; LOOP-NEXT:    buffer_load_ubyte v11, v[2:3], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT:    s_waitcnt vmcnt(3)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; LOOP-NEXT:    v_or_b32_e32 v5, v5, v12
+; LOOP-NEXT:    buffer_load_ubyte v12, v[2:3], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT:    s_waitcnt vmcnt(2)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; LOOP-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; LOOP-NEXT:    v_or_b32_e32 v7, v9, v7
+; LOOP-NEXT:    buffer_load_ubyte v9, v[2:3], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT:    buffer_load_ubyte v13, v[2:3], s[0:3], 0 addr64 offset:14
 ; LOOP-NEXT:    s_waitcnt vmcnt(2)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v33, 8, v33
+; LOOP-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; LOOP-NEXT:    v_or_b32_e32 v11, v12, v11
+; LOOP-NEXT:    s_waitcnt vmcnt(1)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v35, 24, v35
-; LOOP-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; LOOP-NEXT:    v_or_b32_e32 v8, v11, v8
-; LOOP-NEXT:    v_or_b32_e32 v11, v13, v12
-; LOOP-NEXT:    v_or_b32_e32 v9, v15, v9
-; LOOP-NEXT:    v_or_b32_e32 v12, v17, v16
-; LOOP-NEXT:    v_or_b32_e32 v10, v19, v10
-; LOOP-NEXT:    v_or_b32_e32 v13, v21, v20
-; LOOP-NEXT:    v_or_b32_e32 v14, v23, v14
-; LOOP-NEXT:    v_or_b32_e32 v15, v25, v24
-; LOOP-NEXT:    v_or_b32_e32 v16, v27, v18
-; LOOP-NEXT:    v_or_b32_e32 v17, v30, v28
-; LOOP-NEXT:    v_or_b32_e32 v18, v33, v22
-; LOOP-NEXT:    v_or_b32_e32 v19, v35, v34
-; LOOP-NEXT:    v_or_b32_e32 v20, v29, v26
-; LOOP-NEXT:    v_or_b32_e32 v21, v32, v31
-; LOOP-NEXT:    v_or_b32_e32 v8, v11, v8
-; LOOP-NEXT:    v_or_b32_e32 v9, v12, v9
-; LOOP-NEXT:    v_or_b32_e32 v10, v13, v10
-; LOOP-NEXT:    v_or_b32_e32 v11, v15, v14
-; LOOP-NEXT:    v_or_b32_e32 v12, v17, v16
-; LOOP-NEXT:    v_or_b32_e32 v13, v19, v18
-; LOOP-NEXT:    v_lshrrev_b32_e32 v14, 16, v20
-; LOOP-NEXT:    v_bfe_u32 v15, v20, 8, 8
-; LOOP-NEXT:    buffer_store_byte v20, v[6:7], s[0:3], 0 addr64
-; LOOP-NEXT:    v_lshrrev_b32_e32 v16, 24, v20
-; LOOP-NEXT:    v_lshrrev_b32_e32 v17, 16, v21
-; LOOP-NEXT:    v_bfe_u32 v18, v21, 8, 8
-; LOOP-NEXT:    buffer_store_byte v21, v[6:7], s[0:3], 0 addr64 offset:4
-; LOOP-NEXT:    v_lshrrev_b32_e32 v19, 24, v21
-; LOOP-NEXT:    s_waitcnt expcnt(1)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v20, 16, v8
+; LOOP-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; LOOP-NEXT:    v_or_b32_e32 v9, v9, v12
+; LOOP-NEXT:    buffer_load_ubyte v12, v[2:3], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT:    buffer_load_ubyte v13, v[2:3], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT:    buffer_load_ubyte v14, v[2:3], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT:    buffer_load_ubyte v15, v[2:3], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT:    s_waitcnt vmcnt(3)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; LOOP-NEXT:    s_waitcnt vmcnt(2)
+; LOOP-NEXT:    v_or_b32_e32 v12, v12, v13
+; LOOP-NEXT:    s_waitcnt vmcnt(1)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v13, 24, v14
+; LOOP-NEXT:    s_waitcnt vmcnt(0)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; LOOP-NEXT:    v_or_b32_e32 v13, v13, v14
+; LOOP-NEXT:    buffer_load_ubyte v14, v[2:3], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT:    buffer_load_ubyte v15, v[2:3], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT:    buffer_load_ubyte v16, v[2:3], s[0:3], 0 addr64 offset:23
+; LOOP-NEXT:    buffer_load_ubyte v17, v[2:3], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT:    s_waitcnt vmcnt(3)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; LOOP-NEXT:    s_waitcnt vmcnt(2)
+; LOOP-NEXT:    v_or_b32_e32 v14, v14, v15
+; LOOP-NEXT:    s_waitcnt vmcnt(1)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v15, 24, v16
+; LOOP-NEXT:    s_waitcnt vmcnt(0)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; LOOP-NEXT:    v_or_b32_e32 v15, v15, v16
+; LOOP-NEXT:    buffer_load_ubyte v16, v[2:3], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT:    buffer_load_ubyte v17, v[2:3], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT:    buffer_load_ubyte v18, v[2:3], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT:    buffer_load_ubyte v19, v[2:3], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT:    s_waitcnt vmcnt(3)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
+; LOOP-NEXT:    s_waitcnt vmcnt(2)
+; LOOP-NEXT:    v_or_b32_e32 v16, v16, v17
+; LOOP-NEXT:    s_waitcnt vmcnt(1)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v17, 24, v18
+; LOOP-NEXT:    s_waitcnt vmcnt(0)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
+; LOOP-NEXT:    v_or_b32_e32 v17, v17, v18
+; LOOP-NEXT:    buffer_load_ubyte v18, v[2:3], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT:    buffer_load_ubyte v19, v[2:3], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT:    buffer_load_ubyte v20, v[2:3], s[0:3], 0 addr64 offset:31
+; LOOP-NEXT:    buffer_load_ubyte v21, v[2:3], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT:    s_waitcnt vmcnt(3)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
+; LOOP-NEXT:    s_waitcnt vmcnt(2)
+; LOOP-NEXT:    v_or_b32_e32 v18, v18, v19
+; LOOP-NEXT:    s_waitcnt vmcnt(1)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v19, 24, v20
+; LOOP-NEXT:    s_waitcnt vmcnt(0)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; LOOP-NEXT:    v_or_b32_e32 v19, v19, v20
+; LOOP-NEXT:    v_or_b32_e32 v4, v6, v4
+; LOOP-NEXT:    v_or_b32_e32 v6, v10, v8
+; LOOP-NEXT:    v_or_b32_e32 v5, v7, v5
+; LOOP-NEXT:    v_or_b32_e32 v7, v9, v11
+; LOOP-NEXT:    v_or_b32_e32 v8, v13, v12
+; LOOP-NEXT:    v_or_b32_e32 v9, v15, v14
+; LOOP-NEXT:    v_or_b32_e32 v10, v17, v16
+; LOOP-NEXT:    v_or_b32_e32 v11, v19, v18
+; LOOP-NEXT:    v_bfe_u32 v12, v4, 8, 8
+; LOOP-NEXT:    buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
+; LOOP-NEXT:    buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_bfe_u32 v4, v6, 8, 8
+; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v6, 24, v6
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
+; LOOP-NEXT:    buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_bfe_u32 v12, v5, 8, 8
+; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v5, 24, v5
+; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; LOOP-NEXT:    buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:9
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_bfe_u32 v21, v8, 8, 8
-; LOOP-NEXT:    buffer_store_byte v8, v[6:7], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT:    v_bfe_u32 v12, v7, 8, 8
+; LOOP-NEXT:    buffer_store_byte v7, v[0:1], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v7, 24, v7
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v4, 16, v8
+; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_bfe_u32 v5, v8, 8, 8
+; LOOP-NEXT:    buffer_store_byte v8, v[0:1], s[0:3], 0 addr64 offset:16
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v8, 24, v8
-; LOOP-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
-; LOOP-NEXT:    v_bfe_u32 v23, v9, 8, 8
-; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT:    buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v12, 16, v9
+; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_bfe_u32 v6, v9, 8, 8
+; LOOP-NEXT:    buffer_store_byte v9, v[0:1], s[0:3], 0 addr64 offset:20
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v9, 24, v9
-; LOOP-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
-; LOOP-NEXT:    v_bfe_u32 v25, v10, 8, 8
-; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT:    buffer_store_byte v7, v[0:1], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
+; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_bfe_u32 v5, v10, 8, 8
+; LOOP-NEXT:    buffer_store_byte v10, v[0:1], s[0:3], 0 addr64 offset:24
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v10, 24, v10
-; LOOP-NEXT:    v_lshrrev_b32_e32 v26, 16, v11
-; LOOP-NEXT:    v_bfe_u32 v27, v11, 8, 8
-; LOOP-NEXT:    buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:18
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v11, 24, v11
-; LOOP-NEXT:    v_lshrrev_b32_e32 v28, 16, v12
-; LOOP-NEXT:    v_bfe_u32 v29, v12, 8, 8
-; LOOP-NEXT:    buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT:    v_lshrrev_b32_e32 v4, 16, v11
+; LOOP-NEXT:    buffer_store_byte v8, v[0:1], s[0:3], 0 addr64 offset:19
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v12, 24, v12
-; LOOP-NEXT:    v_lshrrev_b32_e32 v30, 16, v13
-; LOOP-NEXT:    v_bfe_u32 v31, v13, 8, 8
-; LOOP-NEXT:    buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT:    v_bfe_u32 v8, v11, 8, 8
+; LOOP-NEXT:    buffer_store_byte v11, v[0:1], s[0:3], 0 addr64 offset:28
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v13, 24, v13
-; LOOP-NEXT:    buffer_store_byte v15, v[6:7], s[0:3], 0 addr64 offset:1
-; LOOP-NEXT:    buffer_store_byte v14, v[6:7], s[0:3], 0 addr64 offset:2
-; LOOP-NEXT:    buffer_store_byte v16, v[6:7], s[0:3], 0 addr64 offset:3
-; LOOP-NEXT:    buffer_store_byte v18, v[6:7], s[0:3], 0 addr64 offset:5
-; LOOP-NEXT:    buffer_store_byte v17, v[6:7], s[0:3], 0 addr64 offset:6
-; LOOP-NEXT:    buffer_store_byte v19, v[6:7], s[0:3], 0 addr64 offset:7
-; LOOP-NEXT:    buffer_store_byte v21, v[6:7], s[0:3], 0 addr64 offset:9
-; LOOP-NEXT:    buffer_store_byte v20, v[6:7], s[0:3], 0 addr64 offset:10
-; LOOP-NEXT:    buffer_store_byte v8, v[6:7], s[0:3], 0 addr64 offset:11
-; LOOP-NEXT:    buffer_store_byte v23, v[6:7], s[0:3], 0 addr64 offset:13
-; LOOP-NEXT:    buffer_store_byte v22, v[6:7], s[0:3], 0 addr64 offset:14
-; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:15
-; LOOP-NEXT:    buffer_store_byte v25, v[6:7], s[0:3], 0 addr64 offset:17
-; LOOP-NEXT:    buffer_store_byte v24, v[6:7], s[0:3], 0 addr64 offset:18
-; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:19
-; LOOP-NEXT:    buffer_store_byte v27, v[6:7], s[0:3], 0 addr64 offset:21
-; LOOP-NEXT:    buffer_store_byte v26, v[6:7], s[0:3], 0 addr64 offset:22
-; LOOP-NEXT:    buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:23
-; LOOP-NEXT:    buffer_store_byte v29, v[6:7], s[0:3], 0 addr64 offset:25
-; LOOP-NEXT:    buffer_store_byte v28, v[6:7], s[0:3], 0 addr64 offset:26
-; LOOP-NEXT:    buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:27
-; LOOP-NEXT:    buffer_store_byte v31, v[6:7], s[0:3], 0 addr64 offset:29
-; LOOP-NEXT:    buffer_store_byte v30, v[6:7], s[0:3], 0 addr64 offset:30
-; LOOP-NEXT:    buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:31
-; LOOP-NEXT:    s_cbranch_vccnz .LBB0_1
+; LOOP-NEXT:    v_lshrrev_b32_e32 v11, 24, v11
+; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT:    buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT:    buffer_store_byte v9, v[0:1], s[0:3], 0 addr64 offset:23
+; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT:    buffer_store_byte v7, v[0:1], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT:    buffer_store_byte v10, v[0:1], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT:    buffer_store_byte v8, v[0:1], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT:    buffer_store_byte v11, v[0:1], s[0:3], 0 addr64 offset:31
+; LOOP-NEXT:    s_add_u32 s0, s0, 32
+; LOOP-NEXT:    s_addc_u32 s1, s1, 0
+; LOOP-NEXT:    s_cmp_lt_u32 s0, 32
+; LOOP-NEXT:    s_cbranch_scc1 .LBB0_1
 ; LOOP-NEXT:  ; %bb.2: ; %static-memcpy-post-expansion
 ; LOOP-NEXT:    s_mov_b32 s2, 0
 ; LOOP-NEXT:    s_mov_b32 s3, 0xf000
 ; LOOP-NEXT:    s_mov_b64 s[0:1], 0
+; LOOP-NEXT:    s_waitcnt expcnt(1)
 ; LOOP-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:33
 ; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:35
 ; LOOP-NEXT:    buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:34
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
index 4d35f3198bc0a..0de062bec8443 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35  %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35  %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
 
 declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
 
@@ -16,60 +16,57 @@ define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) {
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v6, 24, v3
 ; LOOP-NEXT:    v_or_b32_e32 v3, v3, v4
 ; LOOP-NEXT:    v_or_b32_e32 v3, v3, v5
-; LOOP-NEXT:    v_or_b32_e32 v5, v3, v6
-; LOOP-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; LOOP-NEXT:    v_bfe_u32 v7, v5, 8, 8
-; LOOP-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
-; LOOP-NEXT:    v_mov_b32_e32 v4, s1
-; LOOP-NEXT:    v_mov_b32_e32 v3, s0
+; LOOP-NEXT:    v_or_b32_e32 v3, v3, v6
+; LOOP-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; LOOP-NEXT:    v_bfe_u32 v5, v3, 8, 8
+; LOOP-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
 ; LOOP-NEXT:  .LBB0_1: ; %static-memset-expansion-main-body
 ; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT:    v_add_i32_e32 v9, vcc, v0, v3
-; LOOP-NEXT:    v_addc_u32_e32 v10, vcc, v1, v4, vcc
-; LOOP-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
-; LOOP-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64
-; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:1
-; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:2
-; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:3
-; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:4
-; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:5
-; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:6
-; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:7
-; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:8
-; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:9
-; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:10
-; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:11
-; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:12
-; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:13
-; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:14
-; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:15
-; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:16
-; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:17
-; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:18
-; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:19
-; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:20
-; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:21
-; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:22
-; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:23
-; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:24
-; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:25
-; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:26
-; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:27
-; LOOP-NEXT:    buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:28
-; LOOP-NEXT:    buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:29
-; LOOP-NEXT:    buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:30
-; LOOP-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v3
-; LOOP-NEXT:    buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:31
-; LOOP-NEXT:    s_cbranch_vccnz .LBB0_1
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64
+; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:23
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:31
+; LOOP-NEXT:    s_add_u32 s0, s0, 32
+; LOOP-NEXT:    s_addc_u32 s1, s1, 0
+; LOOP-NEXT:    s_cmp_lt_u32 s0, 32
+; LOOP-NEXT:    s_cbranch_scc1 .LBB0_1
 ; LOOP-NEXT:  ; %bb.2: ; %static-memset-post-expansion
 ; LOOP-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; LOOP-NEXT:    s_mov_b32 s2, 0
 ; LOOP-NEXT:    s_mov_b32 s3, 0xf000
 ; LOOP-NEXT:    s_mov_b64 s[0:1], 0
+; LOOP-NEXT:    s_waitcnt expcnt(3)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v3, 8, v2
+; LOOP-NEXT:    s_waitcnt expcnt(1)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; LOOP-NEXT:    s_waitcnt expcnt(3)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v5, 24, v2
 ; LOOP-NEXT:    v_or_b32_e32 v2, v2, v3
 ; LOOP-NEXT:    v_or_b32_e32 v2, v2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 7bd1ff2201977..1ad3aa063ef5a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN:  llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX10 %s
-; RUN:  llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX11 %s
+; RUN:  llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX10 %s
+; RUN:  llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX11 %s
 declare i32 @llvm.amdgcn.workitem.id.x()
 
 ; A 64-bit multiplication where no arguments were zero extended.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index 4a22a911c60b7..e677bbf680ab4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
 
 define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
 ; GFX906-LABEL: v3i8_liveout:
@@ -480,8 +480,9 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX906-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[8:9]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
@@ -585,32 +586,41 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX906-NEXT:    v_cmp_le_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dword v1, v1, s[0:1]
 ; GFX906-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT:    v_and_or_b32 v0, v1, v2, v0
-; GFX906-NEXT:    v_mov_b32_e32 v2, 24
+; GFX906-NEXT:    v_and_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX906-NEXT:    v_and_or_b32 v3, v1, v2, v0
+; GFX906-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX906-NEXT:  .LBB10_1: ; %bb.1
 ; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; GFX906-NEXT:    v_mov_b32_e32 v0, v4
+; GFX906-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX906-NEXT:    s_and_b64 s[2:3], exec, vcc
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX906-NEXT:    v_and_b32_sdwa v5, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX906-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX906-NEXT:    v_or3_b32 v1, v0, v3, v1
+; GFX906-NEXT:    v_or3_b32 v4, v3, v4, v5
 ; GFX906-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX906-NEXT:  ; %bb.2: ; %bb.2.loopexit
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    v_and_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX906-NEXT:    v_and_or_b32 v1, v1, v2, v3
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_or3_b32 v0, v1, v3, v0
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 472e79f58969f..c99ca010b4585 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s
 
 define i128 @fptosi_f64_to_i128(double %x) {
 ; SDAG-LABEL: fptosi_f64_to_i128:
@@ -112,7 +112,6 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; GISEL-NEXT:    s_cbranch_execz .LBB0_6
 ; GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-check.exp.size
 ; GISEL-NEXT:    v_cmp_ge_i64_e32 vcc, -1, v[4:5]
-; GISEL-NEXT:    s_mov_b32 s4, 0x100000
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -164,13 +163,15 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshl_or_b32 v9, v0, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v10, v0, 16, v0
 ; GISEL-NEXT:    v_or3_b32 v8, v1, v2, 1
+; GISEL-NEXT:    v_or3_b32 v9, v0, v2, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0x433
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0xfffff
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0x100000
 ; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
-; GISEL-NEXT:    v_and_or_b32 v5, v5, v2, s4
+; GISEL-NEXT:    v_and_or_b32 v5, v5, v2, v3
 ; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GISEL-NEXT:    s_xor_b64 s[12:13], exec, s[4:5]
@@ -179,32 +180,33 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; GISEL-NEXT:    v_add_u32_e32 v2, 0xfffffbcd, v6
 ; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[4:5]
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v10, v9, 0
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0
 ; GISEL-NEXT:    v_add_u32_e32 v3, 0xfffffb8d, v6
 ; GISEL-NEXT:    v_sub_u32_e32 v6, 64, v2
 ; GISEL-NEXT:    v_lshrrev_b64 v[6:7], v6, v[4:5]
 ; GISEL-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v11, v9, v[0:1]
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, v3, 0, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v12, v8, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v10, v9, v[1:2]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v10, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[5:6]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v12, v9, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v3, 0, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v8, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v13, v8, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v14, v12, v10
+; GISEL-NEXT:    v_mul_lo_u32 v10, v11, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v11, v9, v[1:2]
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v12, v8, v[5:6]
 ; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[8:9], v3, v10, s[8:9]
-; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[6:7], v3, v13, s[6:7]
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v12, v9, v[3:4]
+; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[6:7], v3, v14, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v13, v9, v[3:4]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v7, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, v3, 0, s[4:5]
 ; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v7, v8, v[5:6]
 ; GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:    ; implicit-def: $vgpr10
 ; GISEL-NEXT:    ; implicit-def: $vgpr9
 ; GISEL-NEXT:  .LBB0_3: ; %Flow
 ; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
@@ -212,15 +214,15 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; GISEL-NEXT:  ; %bb.4: ; %fp-to-i-if-exp.small
 ; GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, 0x433, v6
 ; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v0, v[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v4, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v4, v10, 0
 ; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v8, 0
 ; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v9, v[6:7]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v9
+; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v10
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], vcc, v4, v9, v[1:2]
-; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v9
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v10
 ; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v5, v8, v[6:7]
 ; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v3, v4, s[4:5]
-; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v11, vcc
 ; GISEL-NEXT:  .LBB0_5: ; %Flow1
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GISEL-NEXT:  .LBB0_6: ; %fp-to-i-cleanup
@@ -309,9 +311,9 @@ define i128 @fptoui_f64_to_i128(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0x433
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0xfffff
-; GISEL-NEXT:    s_mov_b32 s6, 0x100000
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0x100000
 ; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
-; GISEL-NEXT:    v_and_or_b32 v5, v5, v2, s6
+; GISEL-NEXT:    v_and_or_b32 v5, v5, v2, v3
 ; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
@@ -498,8 +500,9 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshl_or_b32 v7, v0, 16, v0
-; GISEL-NEXT:    v_or3_b32 v8, v1, v2, 1
+; GISEL-NEXT:    v_lshl_or_b32 v9, v0, 16, v0
+; GISEL-NEXT:    v_or3_b32 v7, v1, v2, 1
+; GISEL-NEXT:    v_or3_b32 v8, v0, v2, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0x7fffff
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-NEXT:    v_and_or_b32 v4, v4, v0, v1
@@ -514,44 +517,45 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffff6a, v6
 ; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[4:5]
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v7, 0
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v12, v9, 0
 ; GISEL-NEXT:    v_add_u32_e32 v3, 0xffffff2a, v6
 ; GISEL-NEXT:    v_sub_u32_e32 v6, 64, v2
-; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v6, v[4:5]
+; GISEL-NEXT:    v_lshrrev_b64 v[10:11], v6, v[4:5]
 ; GISEL-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v12, v7, v[0:1]
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v13, v8, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, v3, 0, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v8, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v9, v8, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v11, v7, v[1:2]
-; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v12, v8, v[5:6]
-; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[8:9], v3, v11, s[8:9]
-; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[6:7], v3, v13, s[6:7]
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v9, v7, v[3:4]
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v10, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v3, 0, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v7, v8, v[5:6]
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, v3, 0, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v12, v7, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v10, v7, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v14, v13, v9
+; GISEL-NEXT:    v_mul_lo_u32 v9, v12, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v12, v8, v[1:2]
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v13, v7, v[5:6]
+; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[8:9], v3, v9, s[8:9]
+; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[6:7], v3, v14, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v10, v8, v[3:4]
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v11, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, v3, 0, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v8, v7, v[5:6]
 ; GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GISEL-NEXT:    ; implicit-def: $vgpr4
-; GISEL-NEXT:    ; implicit-def: $vgpr8
 ; GISEL-NEXT:    ; implicit-def: $vgpr7
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:    ; implicit-def: $vgpr8
 ; GISEL-NEXT:  .LBB2_3: ; %Flow
 ; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[12:13]
 ; GISEL-NEXT:    s_cbranch_execz .LBB2_5
 ; GISEL-NEXT:  ; %bb.4: ; %fp-to-i-if-exp.small
 ; GISEL-NEXT:    v_sub_u32_e32 v0, 0x96, v6
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v6, v0, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v6, v7, 0
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], vcc, v6, v7, v[1:2]
-; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v6, v7, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v6, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], vcc, v6, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
 ; GISEL-NEXT:    v_mov_b32_e32 v1, v4
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v5
 ; GISEL-NEXT:  .LBB2_5: ; %Flow1
@@ -866,15 +870,15 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
 ; GISEL-NEXT:    v_or_b32_e32 v2, v2, v18
 ; GISEL-NEXT:    v_or_b32_e32 v1, v1, v0
 ; GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshl_or_b32 v8, v0, 16, v0
-; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v4
+; GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
+; GISEL-NEXT:    v_or3_b32 v7, v1, v0, 1
+; GISEL-NEXT:    v_or3_b32 v8, v9, v0, 0
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v4
 ; GISEL-NEXT:    v_or_b32_e32 v4, 0x80, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0x86
 ; GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GISEL-NEXT:    v_or3_b32 v7, v1, v2, 1
 ; GISEL-NEXT:    v_cmp_ge_u16_sdwa s[4:5], v6, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
@@ -886,25 +890,26 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
 ; GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[4:5]
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v8, 0
+; GISEL-NEXT:    v_lshl_or_b32 v11, v9, 16, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0
 ; GISEL-NEXT:    v_add_u32_e32 v3, 0xffffffc0, v2
 ; GISEL-NEXT:    v_sub_u32_e32 v6, 64, v2
 ; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v6, v[4:5]
 ; GISEL-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v12, v8, v[0:1]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v13, v8, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, v3, 0, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v7, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v12, v7, 0
 ; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v9, v7, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v11, v8, v[1:2]
-; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v12, v7, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v14, v13, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v11
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v12, v8, v[1:2]
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v13, v7, v[5:6]
 ; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[8:9], v3, v11, s[8:9]
-; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[6:7], v3, v13, s[6:7]
+; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[6:7], v3, v14, s[6:7]
 ; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v9, v8, v[3:4]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v10, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, v3, 0, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 49de34820c4c0..417446c68b125 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -5,8 +5,8 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 ; Test using saddr addressing mode of global_*load_* flat instructions.
 
@@ -3921,6 +3921,9 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(ptr addrspace(
 ; GFX12-GISEL-FAKE16:       ; %bb.0:
 ; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
 ; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3959,6 +3962,9 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(ptr
 ; GFX12-GISEL-FAKE16:       ; %bb.0:
 ; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
 ; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3998,9 +4004,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, 0, 16, v0
+; GFX12-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_zero_hi:
@@ -4046,9 +4050,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr a
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, 0, 16, v0
+; GFX12-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
@@ -4279,8 +4281,10 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(ptr addrspace(
 ; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_undef_hi:
 ; GFX12-GISEL-FAKE16:       ; %bb.0:
 ; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e64 v1, 0xffff, s0
 ; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -4318,8 +4322,10 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(ptr
 ; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
 ; GFX12-GISEL-FAKE16:       ; %bb.0:
 ; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e64 v1, 0xffff, s0
 ; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -4373,9 +4379,9 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1
 ;
 ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
-; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3]
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, 0
+; GFX12-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi:
@@ -4435,9 +4441,9 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr a
 ;
 ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
-; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, 0
+; GFX12-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
@@ -4783,24 +4789,18 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
 ;
 ; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv:
 ; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    s_mov_b64 s[0:1], 0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-GISEL-NEXT:  .LBB132_1: ; %bb3
 ; GFX12-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_add_co_u32 v4, vcc, v0, v2
-; GFX12-GISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v1, v3, vcc
-; GFX12-GISEL-NEXT:    v_add_co_u32 v2, vcc, v2, 4
-; GFX12-GISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc
-; GFX12-GISEL-NEXT:    global_load_b32 v4, v[4:5], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s4, s2, s0
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s5, s3, s1
+; GFX12-GISEL-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 4
+; GFX12-GISEL-NEXT:    global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0x400, v2
-; GFX12-GISEL-NEXT:    s_cbranch_vccz .LBB132_1
+; GFX12-GISEL-NEXT:    s_cmp_eq_u32 s0, 0x400
+; GFX12-GISEL-NEXT:    s_cbranch_scc0 .LBB132_1
 ; GFX12-GISEL-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-GISEL-NEXT:    s_endpgm
 bb:
@@ -4903,26 +4903,20 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
 ;
 ; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv_multiload:
 ; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    s_mov_b64 s[0:1], 0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-GISEL-NEXT:  .LBB133_1: ; %bb3
 ; GFX12-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_add_co_u32 v4, vcc, v0, v2
-; GFX12-GISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v1, v3, vcc
-; GFX12-GISEL-NEXT:    v_add_co_u32 v2, vcc, v2, 4
-; GFX12-GISEL-NEXT:    s_wait_alu depctr_va_vcc(0)
-; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc
-; GFX12-GISEL-NEXT:    global_load_b32 v6, v[4:5], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s4, s2, s0
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s5, s3, s1
+; GFX12-GISEL-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 4
+; GFX12-GISEL-NEXT:    global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    global_load_b32 v4, v[4:5], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0x400, v2
-; GFX12-GISEL-NEXT:    s_cbranch_vccz .LBB133_1
+; GFX12-GISEL-NEXT:    s_cmp_eq_u32 s0, 0x400
+; GFX12-GISEL-NEXT:    s_cbranch_scc0 .LBB133_1
 ; GFX12-GISEL-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-GISEL-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
index 6cc6ba732d805..09e6bb4ce7752 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=ASM-DAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=ASM-DAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=ASM-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=ASM-GISEL-FAKE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=ASM-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=ASM-GISEL-FAKE16 %s
 
 ; Test that we can use v0 for temporaries in the if.then block.
 define i32 @dead_i32(i1 %cond, i32 %x, ptr addrspace(1) %ptr1) #0 {
@@ -40,8 +40,8 @@ define i32 @dead_i32(i1 %cond, i32 %x, ptr addrspace(1) %ptr1) #0 {
 ; ASM-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
 ; ASM-GISEL-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; ASM-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; ASM-GISEL-TRUE16-NEXT:    v_and_b16 v1.l, 1, v4.l
-; ASM-GISEL-TRUE16-NEXT:    v_cmpx_ne_u16_e32 0, v1.l
+; ASM-GISEL-TRUE16-NEXT:    v_and_b32_e32 v1, 1, v4
+; ASM-GISEL-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
 ; ASM-GISEL-TRUE16-NEXT:    s_cbranch_execz .LBB0_2
 ; ASM-GISEL-TRUE16-NEXT:  ; %bb.1: ; %if.then
 ; ASM-GISEL-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 1, v0
@@ -143,8 +143,8 @@ define %trivial_types @dead_struct(i1 %cond, %trivial_types %x, ptr addrspace(1)
 ; ASM-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2
 ; ASM-GISEL-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; ASM-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; ASM-GISEL-TRUE16-NEXT:    v_and_b16 v2.l, 1, v20.l
-; ASM-GISEL-TRUE16-NEXT:    v_cmpx_ne_u16_e32 0, v2.l
+; ASM-GISEL-TRUE16-NEXT:    v_and_b32_e32 v2, 1, v20
+; ASM-GISEL-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
 ; ASM-GISEL-TRUE16-NEXT:    s_cbranch_execz .LBB1_2
 ; ASM-GISEL-TRUE16-NEXT:  ; %bb.1: ; %if.then
 ; ASM-GISEL-TRUE16-NEXT:    s_mov_b32 s4, 0
@@ -389,10 +389,10 @@ define [32 x i32] @dead_array(i1 %cond, [32 x i32] %x, ptr addrspace(1) %ptr1, i
 ; ASM-GISEL-TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
 ; ASM-GISEL-TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:12
 ; ASM-GISEL-TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:16
-; ASM-GISEL-TRUE16-NEXT:    v_and_b16 v32.l, 1, v32.l
+; ASM-GISEL-TRUE16-NEXT:    v_and_b32_e32 v32, 1, v32
 ; ASM-GISEL-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; ASM-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; ASM-GISEL-TRUE16-NEXT:    v_cmpx_ne_u16_e32 0, v32.l
+; ASM-GISEL-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
 ; ASM-GISEL-TRUE16-NEXT:    s_cbranch_execz .LBB2_2
 ; ASM-GISEL-TRUE16-NEXT:  ; %bb.1: ; %if.then
 ; ASM-GISEL-TRUE16-NEXT:    s_mov_b32 s1, 15
@@ -643,10 +643,10 @@ define %non_trivial_types @dead_non_trivial(i1 %cond, %non_trivial_types %x, ptr
 ; ASM-GISEL-TRUE16-NEXT:    scratch_load_b32 v31, off, s32 offset:76
 ; ASM-GISEL-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:80
 ; ASM-GISEL-TRUE16-NEXT:    scratch_load_b32 v68, off, s32 offset:84
-; ASM-GISEL-TRUE16-NEXT:    v_and_b16 v1.l, 1, v1.l
+; ASM-GISEL-TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
 ; ASM-GISEL-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; ASM-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; ASM-GISEL-TRUE16-NEXT:    v_cmpx_ne_u16_e32 0, v1.l
+; ASM-GISEL-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
 ; ASM-GISEL-TRUE16-NEXT:    s_cbranch_execz .LBB3_2
 ; ASM-GISEL-TRUE16-NEXT:  ; %bb.1: ; %if.then
 ; ASM-GISEL-TRUE16-NEXT:    s_mov_b32 s1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
index 88a51e9ccf04c..c8b9bb76c41e9 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
+++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
 
 ; Rematerialization test for fp64 constants (w/ intentionally high register pressure).
 ; Check to make sure we have at least six constant MOVs, not necessarily consecutive, inside the loop.