[llvm] [AMDGPU] Compiler should synthesize private buffer resource descriptor from flat_scratch_init (PR #79586)

via llvm-commits llvm-commits@lists.llvm.org
Wed Feb 7 09:26:09 PST 2024


https://github.com/alex-t updated https://github.com/llvm/llvm-project/pull/79586
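
In an entry function the private segment buffer descriptor occupies s[0:3]. Rather than taking a preloaded descriptor and folding the scratch wave offset into its base with a 64-bit add, the patches below synthesize the descriptor from flat_scratch_init: the low 64 bits receive the flat scratch base and the high 64 bits the fixed pattern 0xf0000000. In rough terms, the kernel prologue in the updated tests changes from

    s_add_u32  s0, s0, s7
    s_addc_u32 s1, s1, 0

to

    s_mov_b64 s[2:3], 0xf0000000
    s_mov_b64 s[0:1], flat_scratch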

From 099d17c53c006ed3931f56f94b74c70936e0c366 Mon Sep 17 00:00:00 2001
From: Alexander <alexander.timofeev@amd.com>
Date: Tue, 16 Jan 2024 22:08:07 +0100
Subject: [PATCH 1/7] SWDEV-409366 WIP

---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 47 +++++++++++++++-------
 llvm/lib/Target/AMDGPU/SIFrameLowering.h   |  9 ++++-
 2 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0f89df1444866..e878c4dc06ac9 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -610,11 +610,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const Function &F = MF.getFunction();
+  // const Function &F = MF.getFunction();
   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
 
   assert(MFI->isEntryFunction());
 
+  bool NeedsFlatScratchInit =
+      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
+      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
+       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
+
   Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
@@ -635,12 +640,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
         OtherBB.addLiveIn(ScratchRsrcReg);
       }
     }
+  } else {
+     ScratchRsrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
   }
 
   // Now that we have fixed the reserved SRSRC we need to locate the
   // (potentially) preloaded SRSRC.
   Register PreloadedScratchRsrcReg;
-  if (ST.isAmdHsaOrMesa(F)) {
+  if (ST.isAmdHsaOrMesa(MF.getFunction()) && !NeedsFlatScratchInit) {
     PreloadedScratchRsrcReg =
         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
     if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
@@ -696,10 +703,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
   }
 
-  bool NeedsFlatScratchInit =
-      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
-      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
-       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
 
   if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
       PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
@@ -712,17 +715,17 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   }
 
   if (ScratchRsrcReg) {
-    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
-                                         PreloadedScratchRsrcReg,
-                                         ScratchRsrcReg, ScratchWaveOffsetReg);
+    emitEntryFunctionScratchRsrcRegSetup(
+        MF, MBB, I, DL, NeedsFlatScratchInit, ScratchRsrcReg,
+        PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
   }
 }
 
 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
-    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
+    const DebugLoc &DL, bool HasFlatScratchInit, Register ScratchRsrcReg,
+    Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const {
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -829,11 +832,25 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
       .addImm(Rsrc23 >> 32)
       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
   } else if (ST.isAmdHsaOrMesa(Fn)) {
-    assert(PreloadedScratchRsrcReg);
 
-    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
-          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+    if (HasFlatScratchInit) {
+      if (Register FlatScratchReg = 
+        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT)) {
+        I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
+          .addReg(FlatScratchReg)
+          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+        I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
+          .addImm(0xf0000000)
+          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+      }
+      return;
+    } else {
+      assert(PreloadedScratchRsrcReg);
+
+      if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
+        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+            .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+      }
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index b3feb759ed811..cab4c81ac6e2f 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -78,8 +78,13 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
   void emitEntryFunctionScratchRsrcRegSetup(
       MachineFunction &MF, MachineBasicBlock &MBB,
       MachineBasicBlock::iterator I, const DebugLoc &DL,
-      Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
-      Register ScratchWaveOffsetReg) const;
+      bool HasFlatScratchInit, Register ScratchRsrcReg,
+      Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const;
+  // void emitEntryFunctionScratchRsrcRegSetup(
+  //     MachineFunction &MF, MachineBasicBlock &MBB,
+  //     MachineBasicBlock::iterator I, const DebugLoc &DL,
+  //     Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
+  //     Register ScratchWaveOffsetReg) const;
 
 public:
   bool hasFP(const MachineFunction &MF) const override;

From cc479a91b50465d37967e521706473cd2a0b1759 Mon Sep 17 00:00:00 2001
From: Alexander <alexander.timofeev@amd.com>
Date: Tue, 16 Jan 2024 22:08:07 +0100
Subject: [PATCH 2/7] SWDEV-409366 WIP

---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index e878c4dc06ac9..dbc4959f1e657 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -773,7 +773,8 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
           .addImm(21)
           .addReg(Rsrc03);
     }
-  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
+  } else if (ST.isMesaGfxShader(Fn) ||
+             (!HasFlatScratchInit && !PreloadedScratchRsrcReg)) {
     assert(!ST.isAmdHsaOrMesa(Fn));
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 

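This second patch matters because, after the first one, PreloadedScratchRsrcReg is deliberately left unset on amdhsa targets whenever the flat scratch init path is taken; without the added !HasFlatScratchInit test, such kernels would fall into this non-HSA branch and trip its assert. A condensed view of the resulting condition:

    } else if (ST.isMesaGfxShader(Fn) ||
               (!HasFlatScratchInit && !PreloadedScratchRsrcReg)) {
      assert(!ST.isAmdHsaOrMesa(Fn));
      ...
    }
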
From cefab2497206f2e8d75f70067c325c7b8d6b57d6 Mon Sep 17 00:00:00 2001
From: Alexander <alexander.timofeev@amd.com>
Date: Tue, 16 Jan 2024 22:08:07 +0100
Subject: [PATCH 3/7] SWDEV-409366 WIP

---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 20 ++++++++------------
 llvm/lib/Target/AMDGPU/SIFrameLowering.h   |  5 -----
 2 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index dbc4959f1e657..6ad9c604095f9 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -610,7 +610,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  // const Function &F = MF.getFunction();
   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
 
   assert(MFI->isEntryFunction());
@@ -640,8 +639,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
         OtherBB.addLiveIn(ScratchRsrcReg);
       }
     }
-  } else {
-     ScratchRsrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
   }
 
   // Now that we have fixed the reserved SRSRC we need to locate the
@@ -835,15 +832,14 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
   } else if (ST.isAmdHsaOrMesa(Fn)) {
 
     if (HasFlatScratchInit) {
-      if (Register FlatScratchReg = 
-        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT)) {
-        I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
-          .addReg(FlatScratchReg)
-          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-        I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
-          .addImm(0xf0000000)
-          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-      }
+      I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
+                  TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
+              .addReg(AMDGPU::FLAT_SCR)
+              .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+      I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64),
+                  TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
+              .addImm(0xf0000000)
+              .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
       return;
     } else {
       assert(PreloadedScratchRsrcReg);
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index cab4c81ac6e2f..dc0b4acf6ddae 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -80,11 +80,6 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
       MachineBasicBlock::iterator I, const DebugLoc &DL,
       bool HasFlatScratchInit, Register ScratchRsrcReg,
       Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const;
-  // void emitEntryFunctionScratchRsrcRegSetup(
-  //     MachineFunction &MF, MachineBasicBlock &MBB,
-  //     MachineBasicBlock::iterator I, const DebugLoc &DL,
-  //     Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
-  //     Register ScratchWaveOffsetReg) const;
 
 public:
   bool hasFP(const MachineFunction &MF) const override;

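The fourth patch replaces the HasFlatScratchInit flag with the register itself: emitEntryFunctionFlatScratchInit now returns whichever register ends up holding the flat scratch base (the preloaded FLAT_SCRATCH_INIT pair, a scavenged SGPR pair on PAL, or FLAT_SCR), and the prologue threads that value into the descriptor setup. A condensed sketch of the resulting call sequence:

    Register FlatScratchInit = AMDGPU::NoRegister;
    if (NeedsFlatScratchInit)
      FlatScratchInit =
          emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);

    if (ScratchRsrcReg)
      emitEntryFunctionScratchRsrcRegSetup(
          MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg,
          PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
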
From cbecf53314c6d969d6590a62edcf651718a73105 Mon Sep 17 00:00:00 2001
From: Alexander Timofeev <alexander.timofeev@amd.com>
Date: Wed, 24 Jan 2024 21:39:45 +0100
Subject: [PATCH 4/7] SWDEV-409366 WIP 24 Jan

---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp    |  40 +--
 llvm/lib/Target/AMDGPU/SIFrameLowering.h      |  12 +-
 .../GlobalISel/call-outgoing-stack-args.ll    |   8 +-
 .../abi-attribute-hints-undefined-behavior.ll |  16 +-
 ...der-no-live-segment-at-def-implicit-def.ll |   4 +-
 .../branch-folding-implicit-def-subreg.ll     |   6 +-
 .../CodeGen/AMDGPU/call-argument-types.ll     | 280 +++++++++---------
 .../CodeGen/AMDGPU/call-reqd-group-size.ll    |  24 +-
 llvm/test/CodeGen/AMDGPU/call-waitcnt.ll      |  24 +-
 .../AMDGPU/callee-special-input-vgprs.ll      |   6 +-
 llvm/test/CodeGen/AMDGPU/cc-update.ll         |  64 ++--
 .../AMDGPU/cross-block-use-is-not-abi-copy.ll |   8 +-
 .../AMDGPU/indirect-call-known-callees.ll     |   8 +-
 llvm/test/CodeGen/AMDGPU/indirect-call.ll     |  16 +-
 .../kernel-vgpr-spill-mubuf-with-voffset.ll   |   4 +-
 llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll  |  48 +--
 .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll       |   4 +-
 .../AMDGPU/lower-module-lds-via-hybrid.ll     |  12 +-
 .../AMDGPU/lower-module-lds-via-table.ll      |  12 +-
 ...ne-sink-temporal-divergence-swdev407790.ll |  12 +-
 .../AMDGPU/need-fp-from-vgpr-spills.ll        |  12 +-
 .../CodeGen/AMDGPU/simple-indirect-call.ll    |   4 +-
 .../AMDGPU/tuple-allocation-failure.ll        |  12 +-
 .../CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll   |   4 +-
 24 files changed, 322 insertions(+), 318 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6ad9c604095f9..4842cf83af0e1 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -378,7 +378,7 @@ class PrologEpilogSGPRSpillBuilder {
 } // namespace llvm
 
 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
-void SIFrameLowering::emitEntryFunctionFlatScratchInit(
+Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -398,6 +398,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
 
   Register FlatScrInitLo;
   Register FlatScrInitHi;
+  Register FlatScratchInitReg = AMDGPU::NoRegister;
 
   if (ST.isAmdPalOS()) {
     // Extract the scratch offset from the descriptor in the GIT
@@ -407,7 +408,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
 
     // Find unused reg to load flat scratch init into
     MachineRegisterInfo &MRI = MF.getRegInfo();
-    Register FlatScrInit = AMDGPU::NoRegister;
+    FlatScratchInitReg = AMDGPU::NoRegister;
     ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
     unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
     AllSGPR64s = AllSGPR64s.slice(
@@ -416,16 +417,16 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
     for (MCPhysReg Reg : AllSGPR64s) {
       if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
           MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
-        FlatScrInit = Reg;
+        FlatScratchInitReg = Reg;
         break;
       }
     }
-    assert(FlatScrInit && "Failed to find free register for scratch init");
+    assert(FlatScratchInitReg && "Failed to find free register for scratch init");
 
-    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
-    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
+    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
 
-    buildGitPtr(MBB, I, DL, TII, FlatScrInit);
+    buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg);
 
     // We now have the GIT ptr - now get the scratch descriptor from the entry
     // at offset 0 (or offset 16 for a compute shader).
@@ -440,8 +441,8 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
         MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
     const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
     unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
-    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
-        .addReg(FlatScrInit)
+    BuildMI(MBB, I, DL, LoadDwordX2, FlatScratchInitReg)
+        .addReg(FlatScratchInitReg)
         .addImm(EncodedOffset) // offset
         .addImm(0)             // cpol
         .addMemOperand(MMO);
@@ -453,7 +454,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
         .addImm(0xffff);
     And->getOperand(3).setIsDead(); // Mark SCC as dead.
   } else {
-    Register FlatScratchInitReg =
+    FlatScratchInitReg =
         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
     assert(FlatScratchInitReg);
 
@@ -485,7 +486,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
         addReg(FlatScrInitHi).
         addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
-      return;
+      return FlatScratchInitReg;
     }
 
     // For GFX9.
@@ -498,7 +499,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
       .addImm(0);
     Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
 
-    return;
+    return AMDGPU::FLAT_SCR;
   }
 
   assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
@@ -519,6 +520,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
     .addReg(FlatScrInitLo, RegState::Kill)
     .addImm(8);
   LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
+  return AMDGPU::FLAT_SCR;
 }
 
 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
@@ -707,13 +709,15 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
     MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
   }
 
+  Register FlatScratchInit = AMDGPU::NoRegister;
   if (NeedsFlatScratchInit) {
-    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
+    FlatScratchInit =
+        emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
   }
 
   if (ScratchRsrcReg) {
     emitEntryFunctionScratchRsrcRegSetup(
-        MF, MBB, I, DL, NeedsFlatScratchInit, ScratchRsrcReg,
+        MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg,
         PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
   }
 }
@@ -721,7 +725,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-    const DebugLoc &DL, bool HasFlatScratchInit, Register ScratchRsrcReg,
+    const DebugLoc &DL, Register FlatScratchInit, Register ScratchRsrcReg,
     Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const {
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -771,7 +775,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
           .addReg(Rsrc03);
     }
   } else if (ST.isMesaGfxShader(Fn) ||
-             (!HasFlatScratchInit && !PreloadedScratchRsrcReg)) {
+             (!FlatScratchInit.isValid() && !PreloadedScratchRsrcReg)) {
     assert(!ST.isAmdHsaOrMesa(Fn));
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 
@@ -831,10 +835,10 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
   } else if (ST.isAmdHsaOrMesa(Fn)) {
 
-    if (HasFlatScratchInit) {
+    if (FlatScratchInit) {
       I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
                   TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
-              .addReg(AMDGPU::FLAT_SCR)
+              .addReg(FlatScratchInit)
               .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
       I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64),
                   TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index dc0b4acf6ddae..f706d48b2dc10 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -67,18 +67,18 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
                                 MachineBasicBlock::iterator MI) const override;
 
 private:
-  void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
-                                        MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator I,
-                                        const DebugLoc &DL,
-                                        Register ScratchWaveOffsetReg) const;
+  Register
+  emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I,
+                                   const DebugLoc &DL,
+                                   Register ScratchWaveOffsetReg) const;
 
   Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
 
   void emitEntryFunctionScratchRsrcRegSetup(
       MachineFunction &MF, MachineBasicBlock &MBB,
       MachineBasicBlock::iterator I, const DebugLoc &DL,
-      bool HasFlatScratchInit, Register ScratchRsrcReg,
+      Register FlatScratchInit, Register ScratchRsrcReg,
       Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const;
 
 public:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index e597ce6f114a6..a7277414391cb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -14,9 +14,9 @@ define amdgpu_kernel void @kernel_caller_stack() {
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
 ; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT:    s_add_u32 s0, s0, s7
+; MUBUF-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; MUBUF-NEXT:    s_mov_b32 s32, 0
-; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; MUBUF-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 10
@@ -62,8 +62,8 @@ define amdgpu_kernel void @kernel_caller_byval() {
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
 ; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT:    s_add_u32 s0, s0, s7
-; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; MUBUF-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; MUBUF-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index a439c0f51ffe9..bda25cda4c5f9 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -48,19 +48,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
 ; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs:
 ; FIXEDABI-SDAG:       ; %bb.0:
 ; FIXEDABI-SDAG-NEXT:    s_add_i32 s4, s4, s9
-; FIXEDABI-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; FIXEDABI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-SDAG-NEXT:    s_add_u32 s0, s0, s9
+; FIXEDABI-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; FIXEDABI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; FIXEDABI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; FIXEDABI-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; FIXEDABI-SDAG-NEXT:    s_mov_b32 s14, s8
 ; FIXEDABI-SDAG-NEXT:    v_or_b32_e32 v31, v0, v2
 ; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[8:9], 0
 ; FIXEDABI-SDAG-NEXT:    s_mov_b32 s12, s6
 ; FIXEDABI-SDAG-NEXT:    s_mov_b32 s13, s7
 ; FIXEDABI-SDAG-NEXT:    s_mov_b32 s32, 0
-; FIXEDABI-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; FIXEDABI-SDAG-NEXT:    s_getpc_b64 s[4:5]
 ; FIXEDABI-SDAG-NEXT:    s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
 ; FIXEDABI-SDAG-NEXT:    s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
@@ -70,19 +70,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
 ; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs:
 ; FIXEDABI-GISEL:       ; %bb.0:
 ; FIXEDABI-GISEL-NEXT:    s_add_i32 s4, s4, s9
-; FIXEDABI-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; FIXEDABI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-GISEL-NEXT:    s_add_u32 s0, s0, s9
+; FIXEDABI-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; FIXEDABI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; FIXEDABI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 20, v2
-; FIXEDABI-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; FIXEDABI-GISEL-NEXT:    s_mov_b32 s14, s8
 ; FIXEDABI-GISEL-NEXT:    v_or_b32_e32 v31, v0, v1
 ; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[8:9], 0
 ; FIXEDABI-GISEL-NEXT:    s_mov_b32 s12, s6
 ; FIXEDABI-GISEL-NEXT:    s_mov_b32 s13, s7
 ; FIXEDABI-GISEL-NEXT:    s_mov_b32 s32, 0
-; FIXEDABI-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; FIXEDABI-GISEL-NEXT:    s_getpc_b64 s[4:5]
 ; FIXEDABI-GISEL-NEXT:    s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
 ; FIXEDABI-GISEL-NEXT:    s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index 7c8d40c49bb80..20f60d1db7fb5 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -10,8 +10,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; CHECK-NEXT:    s_load_dwordx8 s[36:43], s[6:7], 0x0
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 5a128c7541d1e..2baaefb76acb3 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -5,13 +5,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-LABEL: name: f1
   ; GFX90A: bb.0.bb:
   ; GFX90A-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr15, $sgpr10_sgpr11
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $sgpr32 = S_MOV_B32 0
   ; GFX90A-NEXT:   $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc
   ; GFX90A-NEXT:   $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit $scc
-  ; GFX90A-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   $sgpr2_sgpr3 = S_MOV_B64 4026531840, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT:   renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
   ; GFX90A-NEXT:   renamable $vgpr31 = COPY $vgpr0, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index a192a1b8dff93..0fe54349215ba 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -129,12 +129,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_i1_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
@@ -234,8 +234,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
@@ -339,8 +339,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
@@ -422,12 +422,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 ; HSA-LABEL: test_call_external_void_func_i8_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
@@ -525,8 +525,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_sbyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
@@ -625,8 +625,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
@@ -707,12 +707,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_i16_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
@@ -809,8 +809,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_sshort v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
@@ -909,8 +909,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ushort v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4
@@ -991,12 +991,12 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
 ; HSA-LABEL: test_call_external_void_func_i32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 42
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12
@@ -1078,13 +1078,13 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_i64_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; HSA-NEXT:    v_mov_b32_e32 v1, 0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12
@@ -1182,12 +1182,12 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; HSA-NEXT:    s_mov_b32 s4, 0
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_mov_b32 s5, s4
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
@@ -1278,15 +1278,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v2i64_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
 ; HSA-NEXT:    v_mov_b32_e32 v2, 3
 ; HSA-NEXT:    v_mov_b32_e32 v3, 4
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
@@ -1391,12 +1391,12 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; HSA-NEXT:    s_mov_b32 s4, 0
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_mov_b32 s5, s4
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v4, 1
 ; HSA-NEXT:    v_mov_b32_e32 v5, 2
 ; HSA-NEXT:    s_mov_b32 s32, 0
@@ -1514,12 +1514,12 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; HSA-NEXT:    s_mov_b32 s4, 0
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_mov_b32 s5, s4
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v4, 1
 ; HSA-NEXT:    v_mov_b32_e32 v5, 2
 ; HSA-NEXT:    v_mov_b32_e32 v6, 3
@@ -1605,12 +1605,12 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_f16_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x4400
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
@@ -1689,12 +1689,12 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_f32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 4.0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12
@@ -1776,13 +1776,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v2f32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12
@@ -1868,14 +1868,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v3f32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
 ; HSA-NEXT:    v_mov_b32_e32 v2, 4.0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12
@@ -1968,16 +1968,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v5f32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
 ; HSA-NEXT:    v_mov_b32_e32 v2, 4.0
 ; HSA-NEXT:    v_mov_b32_e32 v3, -1.0
 ; HSA-NEXT:    v_mov_b32_e32 v4, 0.5
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12
@@ -2059,13 +2059,13 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_f64_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 0x40100000
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12
@@ -2154,15 +2154,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v2f64_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
 ; HSA-NEXT:    v_mov_b32_e32 v2, 0
 ; HSA-NEXT:    v_mov_b32_e32 v3, 0x40100000
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12
@@ -2258,9 +2258,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v3f64_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
 ; HSA-NEXT:    v_mov_b32_e32 v2, 0
@@ -2268,7 +2269,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v4, 0
 ; HSA-NEXT:    v_mov_b32_e32 v5, 0x40200000
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12
@@ -2357,14 +2357,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
 ; HSA-LABEL: test_call_external_void_func_v2i16:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12
@@ -2456,14 +2456,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
 ; HSA-LABEL: test_call_external_void_func_v3i16:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
@@ -2556,14 +2556,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
 ; HSA-LABEL: test_call_external_void_func_v3f16:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
@@ -2647,13 +2647,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v3i16_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; HSA-NEXT:    v_mov_b32_e32 v1, 3
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
@@ -2737,13 +2737,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v3f16_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x40003c00
 ; HSA-NEXT:    v_mov_b32_e32 v1, 0x4400
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
@@ -2835,14 +2835,14 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
 ; HSA-LABEL: test_call_external_void_func_v4i16:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
@@ -2928,13 +2928,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v4i16_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; HSA-NEXT:    v_mov_b32_e32 v1, 0x40003
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
@@ -3025,14 +3025,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
 ; HSA-LABEL: test_call_external_void_func_v2f16:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12
@@ -3120,14 +3120,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
 ; HSA-LABEL: test_call_external_void_func_v2i32:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
@@ -3210,13 +3210,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v2i32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
@@ -3302,14 +3302,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; HSA-LABEL: test_call_external_void_func_v3i32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 3
 ; HSA-NEXT:    v_mov_b32_e32 v1, 4
 ; HSA-NEXT:    v_mov_b32_e32 v2, 5
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12
@@ -3398,15 +3398,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; HSA-LABEL: test_call_external_void_func_v3i32_i32:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 3
 ; HSA-NEXT:    v_mov_b32_e32 v1, 4
 ; HSA-NEXT:    v_mov_b32_e32 v2, 5
 ; HSA-NEXT:    v_mov_b32_e32 v3, 6
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12
@@ -3493,14 +3493,14 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
 ; HSA-LABEL: test_call_external_void_func_v4i32:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
@@ -3590,15 +3590,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v4i32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
 ; HSA-NEXT:    v_mov_b32_e32 v2, 3
 ; HSA-NEXT:    v_mov_b32_e32 v3, 4
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
@@ -3691,16 +3691,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v5i32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
 ; HSA-NEXT:    v_mov_b32_e32 v2, 3
 ; HSA-NEXT:    v_mov_b32_e32 v3, 4
 ; HSA-NEXT:    v_mov_b32_e32 v4, 5
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12
@@ -3803,13 +3803,13 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; HSA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
@@ -3915,9 +3915,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v8i32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
 ; HSA-NEXT:    v_mov_b32_e32 v2, 3
@@ -3927,7 +3928,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v6, 7
 ; HSA-NEXT:    v_mov_b32_e32 v7, 8
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
@@ -4038,7 +4038,6 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; HSA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4046,7 +4045,8 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
 ; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
 ; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
@@ -4183,7 +4183,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; HSA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4195,8 +4194,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
 ; HSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
 ; HSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
 ; HSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_getpc_b64 s[8:9]
 ; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
@@ -4359,9 +4359,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; HSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
 ; HSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
 ; HSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
-; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
@@ -4468,14 +4468,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
 ;
 ; HSA-LABEL: test_call_external_i32_func_i32_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
 ; HSA-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x0
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 42
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_mov_b32 s39, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s38, -1
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -4583,13 +4583,13 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; HSA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; HSA-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:4
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
@@ -4704,9 +4704,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 3
 ; HSA-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:8
 ; HSA-NEXT:    v_mov_b32_e32 v0, 8
@@ -4714,7 +4715,6 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; HSA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12
 ; HSA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:8
 ; HSA-NEXT:    s_movk_i32 s32, 0x400
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
@@ -4879,9 +4879,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 3
 ; HSA-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:8
 ; HSA-NEXT:    v_mov_b32_e32 v0, 8
@@ -4889,7 +4890,6 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; HSA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12
 ; HSA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:8
 ; HSA-NEXT:    s_movk_i32 s32, 0x800
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
@@ -5087,10 +5087,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; HSA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; HSA-NEXT:    s_mov_b32 s32, 0
@@ -5341,14 +5341,14 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64:
 ; HSA:       ; %bb.0: ; %entry
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; HSA-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
 ; HSA-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x80
 ; HSA-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-NEXT:    v_mov_b32_e32 v0, s23
 ; HSA-NEXT:    v_mov_b32_e32 v1, s6
diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
index c62a082459105..efa412d6b0a32 100644
--- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
@@ -12,9 +12,9 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 20, v2
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    v_lshl_or_b32 v31, v1, 10, v0
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -32,8 +32,8 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    v_lshl_or_b32 v31, v2, 20, v0
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -51,8 +51,8 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    v_lshl_or_b32 v31, v1, 10, v0
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -70,8 +70,8 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v0
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -89,8 +89,8 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v31, 10, v1
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -109,8 +109,8 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    v_mov_b32_e32 v31, 0
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 616e5f00fc1e5..8e4baf4a8e5a4 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -8,8 +8,8 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s9
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    ds_read_b32 v0, v0
@@ -31,9 +31,9 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_store_dword v0, v0, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
@@ -52,11 +52,11 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
 define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-LABEL: call_no_wait_after_call:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s9
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
@@ -74,11 +74,11 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #
 define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-LABEL: call_no_wait_after_call_return_val:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s9
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
@@ -99,12 +99,12 @@ define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s9
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 6d603ef039769..49bf48a3687c9 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -165,7 +165,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
 ; FIXEDABI-NOT: v1
 ; FIXEDABI-NOT: v2
 ; FIXEDABI: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1
+; FIXEDABI: v_or_b32_e32 v31, v0, v1
 ; FIXEDABI-NOT: v0
 ; FIXEDABI-NOT: v1
 ; FIXEDABI-NOT: v2
@@ -181,7 +181,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
 ; FIXEDABI-NOT: v1
 ; FIXEDABI-NOT: v2
 ; FIXEDABI: v_lshlrev_b32_e32 v1, 20, v2
-; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1
+; FIXEDABI: v_or_b32_e32 v31, v0, v1
 ; FIXEDABI-NOT: v0
 ; FIXEDABI-NOT: v1
 ; FIXEDABI-NOT: v2
@@ -198,7 +198,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
 ; FIXEDABI-NOT: v2
 ; FIXEDABI:v_lshlrev_b32_e32 v0, 20, v2
 ; FIXEDABI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-NEXT: v_or_b32_e32 v31, v1, v0
+; FIXEDABI: v_or_b32_e32 v31, v1, v0
 ; FIXEDABI-NOT: v0
 ; FIXEDABI-NOT: v1
 ; FIXEDABI-NOT: v2
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index ca09163b20afc..98f60a76a9e82 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -68,13 +68,13 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX803-LABEL: test_kern_call:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT:    s_add_u32 s0, s0, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GFX803-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX803-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -89,10 +89,10 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT:    s_add_u32 s0, s0, s15
+; GFX900-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -112,8 +112,8 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT:    s_add_u32 s0, s0, s15
-; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
+; GFX1010-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
@@ -148,13 +148,13 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX803-LABEL: test_kern_stack_and_call:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT:    s_add_u32 s0, s0, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GFX803-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
@@ -172,10 +172,10 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT:    s_add_u32 s0, s0, s15
+; GFX900-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
@@ -199,8 +199,8 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1010-NEXT:    s_add_u32 s0, s0, s15
-; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
+; GFX1010-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -311,13 +311,13 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX803-LABEL: test_force_fp_kern_call:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT:    s_add_u32 s0, s0, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GFX803-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX803-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -333,10 +333,10 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT:    s_add_u32 s0, s0, s15
+; GFX900-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -358,8 +358,8 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT:    s_add_u32 s0, s0, s15
-; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
+; GFX1010-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
@@ -413,14 +413,14 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX803-LABEL: test_force_fp_kern_stack_and_call:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT:    s_add_u32 s0, s0, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    s_mov_b32 s33, 0
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GFX803-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
@@ -438,11 +438,11 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT:    s_add_u32 s0, s0, s15
+; GFX900-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX900-NEXT:    s_mov_b32 s33, 0
-; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
@@ -467,8 +467,8 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1010-NEXT:    s_add_u32 s0, s0, s15
-; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
+; GFX1010-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 11871db1ef656..408d3b2c4c229 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -180,8 +180,8 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_load_dword s8, s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s0, s0, s15
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_bitcmp1_b32 s8, 0
@@ -229,8 +229,8 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_load_dword s8, s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s0, s0, s15
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_bitcmp1_b32 s8, 0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index fe7323eeadf8a..955c0605f0536 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -12,8 +12,6 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_load_dword s7, s[4:5], 0x0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -24,14 +22,16 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
 ; CHECK-NEXT:    s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
 ; CHECK-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[8:9], 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_and_b32 s4, 1, s7
 ; CHECK-NEXT:    s_cmp_eq_u32 s4, 1
-; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    s_cselect_b32 s5, s13, s11
 ; CHECK-NEXT:    s_cselect_b32 s4, s12, s10
+; CHECK-NEXT:    s_mov_b64 s[8:9], 0
 ; CHECK-NEXT:    s_mov_b32 s12, s6
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 3aaf04c94cda5..594f3b33bfb36 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -12,8 +12,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s17
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b32 s13, s15
 ; GCN-NEXT:    s_mov_b32 s12, s14
 ; GCN-NEXT:    s_getpc_b64 s[14:15]
@@ -37,8 +37,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GISEL-NEXT:    s_add_i32 s12, s12, s17
 ; GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GISEL-NEXT:    s_add_u32 s0, s0, s17
-; GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GISEL-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GISEL-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GISEL-NEXT:    s_mov_b32 s13, s15
 ; GISEL-NEXT:    s_mov_b32 s12, s14
 ; GISEL-NEXT:    s_getpc_b64 s[14:15]
@@ -67,8 +67,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s17
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b32 s13, s15
 ; GCN-NEXT:    s_mov_b32 s12, s14
 ; GCN-NEXT:    s_getpc_b64 s[14:15]
@@ -93,8 +93,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 ; GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GISEL-NEXT:    s_add_i32 s12, s12, s17
 ; GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GISEL-NEXT:    s_add_u32 s0, s0, s17
-; GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GISEL-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GISEL-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GISEL-NEXT:    s_mov_b32 s13, s15
 ; GISEL-NEXT:    s_mov_b32 s12, s14
 ; GISEL-NEXT:    s_getpc_b64 s[14:15]
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
index 6e905542ce53c..4944a6db486ba 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -11,8 +11,8 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
 ; CHECK-NEXT:    s_mov_b32 s33, 0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 66f31bbf7afe0..96dfb1fbf8017 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -118,10 +118,10 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -177,10 +177,10 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -236,10 +236,10 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
@@ -295,10 +295,10 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
@@ -341,8 +341,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -351,6 +349,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s15, 0
 ; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -370,14 +370,14 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -410,8 +410,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -420,6 +418,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s15, 2
 ; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -439,14 +439,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -479,8 +479,6 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -489,6 +487,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s15, 1
 ; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -508,14 +508,14 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -548,8 +548,6 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -558,6 +556,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s15, 3
 ; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -577,14 +577,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 61818dafd2b84..5c33d50382174 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -45,8 +45,8 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GCN-NEXT:    s_getpc_b64 s[6:7]
 ; GCN-NEXT:    s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index 41551d5fb9060..34ca2793997a4 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -164,8 +164,8 @@ define amdgpu_kernel void @k01() {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, f0@gotpcrel32@lo+4
@@ -198,8 +198,8 @@ define amdgpu_kernel void @k23() {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, f2@gotpcrel32@lo+4
@@ -240,8 +240,8 @@ define amdgpu_kernel void @k123() {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, f1@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 38d6039670ab4..9029e433700f5 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -229,8 +229,8 @@ define amdgpu_kernel void @k01() {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, f0@gotpcrel32@lo+4
@@ -268,8 +268,8 @@ define amdgpu_kernel void @k23() {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, f2@gotpcrel32@lo+4
@@ -310,8 +310,8 @@ define amdgpu_kernel void @k123() {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, f1@gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 1c75a2fc3dce6..e88b76b30197c 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -44,17 +44,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; CHECK-NEXT:    s_load_dwordx8 s[44:51], s[6:7], 0x0
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v0
 ; CHECK-NEXT:    s_add_u32 s42, s34, 40
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_addc_u32 s43, s35, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; CHECK-NEXT:    s_mov_b32 s33, s14
 ; CHECK-NEXT:    s_mov_b32 s40, s13
 ; CHECK-NEXT:    s_mov_b32 s41, s12
@@ -766,17 +766,17 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; CHECK-NEXT:    s_load_dwordx2 s[46:47], s[6:7], 0x10
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v40, v0
 ; CHECK-NEXT:    s_add_u32 s42, s36, 40
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[8:9]
 ; CHECK-NEXT:    s_addc_u32 s43, s37, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
 ; CHECK-NEXT:    s_mov_b32 s33, s14
 ; CHECK-NEXT:    s_mov_b32 s40, s13
 ; CHECK-NEXT:    s_mov_b32 s41, s12
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index f70441e87a74b..46b69537798e5 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -69,8 +69,8 @@ define amdgpu_kernel void @kernel_call() {
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
@@ -128,8 +128,8 @@ define amdgpu_kernel void @kernel_tailcall() {
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
@@ -240,8 +240,8 @@ define protected amdgpu_kernel void @kernel() {
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index dcc90c0dcd407..92fbbdb9c4e87 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -45,8 +45,8 @@ define amdgpu_kernel void @test_simple_indirect_call() {
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT:    s_add_u32 s0, s0, s17
-; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX9-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s5
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index dd8ff64a4eec2..9510b400d1d6c 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -45,10 +45,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[38:39], 0x18
 ; GLOBALNESS1-NEXT:    s_load_dword s7, s[38:39], 0x20
 ; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s15
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
-; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s74, 0
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -78,6 +76,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v3
+; GLOBALNESS1-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GLOBALNESS1-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GLOBALNESS1-NEXT:    s_mov_b32 s68, s14
 ; GLOBALNESS1-NEXT:    s_mov_b32 s69, s13
 ; GLOBALNESS1-NEXT:    s_mov_b32 s70, s12
@@ -327,10 +327,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[38:39], 0x18
 ; GLOBALNESS0-NEXT:    s_load_dword s7, s[38:39], 0x20
 ; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s15
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
-; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s74, 0
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
@@ -360,6 +358,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v3
+; GLOBALNESS0-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GLOBALNESS0-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GLOBALNESS0-NEXT:    s_mov_b32 s66, s14
 ; GLOBALNESS0-NEXT:    s_mov_b32 s67, s13
 ; GLOBALNESS0-NEXT:    s_mov_b32 s68, s12
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
index 7840559c78eb6..c8ca99926eab4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
@@ -14,8 +14,8 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id
 ; CHECK-NEXT:    s_addc_u32 s11, s11, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v5, 42

>From 10de7a0ce6eb332048477f50dd54c9ba49a1a664 Mon Sep 17 00:00:00 2001
From: Alexander Timofeev <alexander.timofeev at amd.com>
Date: Fri, 2 Feb 2024 19:49:58 +0100
Subject: [PATCH 5/7] [AMDGPU] Compiler should synthesize private buffer
 resource descriptor from flat_scratch_init

---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp    |  32 ++--
 .../GlobalISel/call-outgoing-stack-args.ll    |   6 +-
 .../abi-attribute-hints-undefined-behavior.ll |   6 +-
 ...der-no-live-segment-at-def-implicit-def.ll |   3 +-
 .../branch-folding-implicit-def-subreg.ll     |   3 +-
 .../CodeGen/AMDGPU/call-argument-types.ll     | 149 ++++++++++++------
 .../CodeGen/AMDGPU/call-reqd-group-size.ll    |  18 ++-
 llvm/test/CodeGen/AMDGPU/call-waitcnt.ll      |  17 +-
 llvm/test/CodeGen/AMDGPU/cc-update.ll         |  44 ++++--
 .../AMDGPU/cross-block-use-is-not-abi-copy.ll |   6 +-
 .../AMDGPU/indirect-call-known-callees.ll     |   3 +-
 llvm/test/CodeGen/AMDGPU/indirect-call.ll     |  12 +-
 .../kernel-vgpr-spill-mubuf-with-voffset.ll   |   3 +-
 llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll  |  36 +++--
 .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll       |   3 +-
 .../AMDGPU/lower-module-lds-via-hybrid.ll     |  11 +-
 .../AMDGPU/lower-module-lds-via-table.ll      |   9 +-
 ...ne-sink-temporal-divergence-swdev407790.ll |   6 +-
 .../AMDGPU/need-fp-from-vgpr-spills.ll        |   9 +-
 .../CodeGen/AMDGPU/simple-indirect-call.ll    |   7 +-
 .../AMDGPU/tuple-allocation-failure.ll        |   6 +-
 .../CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll   |   3 +-
 22 files changed, 256 insertions(+), 136 deletions(-)

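Note (in the patch-notes region, ignored by git am): the paired s_mov_b32
immediates seen throughout the updated checks are just the low and high halves
of the 64-bit value returned by TII->getScratchRsrcWords23(). A minimal
standalone C++ sketch of that split, using the GFX9 value implied by the
updated tests (s2 = -1, s3 = 0xe00000); the constant is assumed for
illustration, not queried from a real subtarget:

  #include <cstdint>
  #include <cstdio>

  int main() {
    // Words 2-3 of the scratch buffer descriptor: NUM_RECORDS = 0xffffffff
    // in the low word, subtarget-specific bits in the high word.
    const uint64_t Rsrc23 = 0x00e00000ffffffffULL;
    const uint32_t Rsrc2 = static_cast<uint32_t>(Rsrc23 & 0xffffffff); // -> s2
    const uint32_t Rsrc3 = static_cast<uint32_t>(Rsrc23 >> 32);        // -> s3
    std::printf("s_mov_b32 s2, 0x%x\ns_mov_b32 s3, 0x%x\n", Rsrc2, Rsrc3);
    return 0;
  }
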
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 4842cf83af0e1..f5c3efecd7316 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -398,7 +398,7 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
 
   Register FlatScrInitLo;
   Register FlatScrInitHi;
-  Register FlatScratchInitReg = AMDGPU::NoRegister;
+  Register FlatScratchInitReg;
 
   if (ST.isAmdPalOS()) {
     // Extract the scratch offset from the descriptor in the GIT
@@ -408,7 +408,6 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
 
     // Find unused reg to load flat scratch init into
     MachineRegisterInfo &MRI = MF.getRegInfo();
-    Register FlatScratchInitReg = AMDGPU::NoRegister;
     ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
     unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
     AllSGPR64s = AllSGPR64s.slice(
@@ -709,7 +708,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
     MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
   }
 
-  Register FlatScratchInit = AMDGPU::NoRegister;
+  Register FlatScratchInit;
   if (NeedsFlatScratchInit) {
     FlatScratchInit =
         emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
@@ -836,22 +835,29 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
   } else if (ST.isAmdHsaOrMesa(Fn)) {
 
     if (FlatScratchInit) {
+      const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
+      Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+      Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+      uint64_t Rsrc23 = TII->getScratchRsrcWords23();
       I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
                   TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
               .addReg(FlatScratchInit)
               .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-      I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64),
-                  TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
-              .addImm(0xf0000000)
-              .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+      BuildMI(MBB, I, DL, SMovB32, Rsrc2)
+          .addImm(Rsrc23 & 0xffffffff)
+          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+      BuildMI(MBB, I, DL, SMovB32, Rsrc3)
+          .addImm(Rsrc23 >> 32)
+          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
       return;
-    } else {
-      assert(PreloadedScratchRsrcReg);
+    }
 
-      if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
-        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
-            .addReg(PreloadedScratchRsrcReg, RegState::Kill);
-      }
+    assert(PreloadedScratchRsrcReg);
+
+    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index a7277414391cb..6e49a5a4ec0e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -13,8 +13,9 @@ define amdgpu_kernel void @kernel_caller_stack() {
 ; MUBUF-LABEL: kernel_caller_stack:
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; MUBUF-NEXT:    s_mov_b32 s2, -1
 ; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; MUBUF-NEXT:    s_mov_b32 s3, 0xe00000
 ; MUBUF-NEXT:    s_mov_b32 s32, 0
 ; MUBUF-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
@@ -61,8 +62,9 @@ define amdgpu_kernel void @kernel_caller_byval() {
 ; MUBUF-LABEL: kernel_caller_byval:
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; MUBUF-NEXT:    s_mov_b32 s2, -1
 ; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; MUBUF-NEXT:    s_mov_b32 s3, 0xe00000
 ; MUBUF-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index bda25cda4c5f9..609b5e6f49ef1 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -48,10 +48,11 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
 ; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs:
 ; FIXEDABI-SDAG:       ; %bb.0:
 ; FIXEDABI-SDAG-NEXT:    s_add_i32 s4, s4, s9
+; FIXEDABI-SDAG-NEXT:    s_mov_b32 s2, -1
 ; FIXEDABI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; FIXEDABI-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; FIXEDABI-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; FIXEDABI-SDAG-NEXT:    s_mov_b32 s3, 0x11e80000
 ; FIXEDABI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; FIXEDABI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
 ; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[0:1], flat_scratch
@@ -70,10 +71,11 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
 ; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs:
 ; FIXEDABI-GISEL:       ; %bb.0:
 ; FIXEDABI-GISEL-NEXT:    s_add_i32 s4, s4, s9
+; FIXEDABI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; FIXEDABI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; FIXEDABI-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; FIXEDABI-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; FIXEDABI-GISEL-NEXT:    s_mov_b32 s3, 0x11e80000
 ; FIXEDABI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; FIXEDABI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 20, v2
 ; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[0:1], flat_scratch
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index 20f60d1db7fb5..74c6bb599cb9b 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -10,7 +10,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; CHECK-NEXT:    s_load_dwordx8 s[36:43], s[6:7], 0x0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_mov_b32 s8, 0
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 2baaefb76acb3..c06f213b9eb66 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -10,7 +10,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   $sgpr32 = S_MOV_B32 0
   ; GFX90A-NEXT:   $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc
   ; GFX90A-NEXT:   $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit $scc
-  ; GFX90A-NEXT:   $sgpr2_sgpr3 = S_MOV_B64 4026531840, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   $sgpr2 = S_MOV_B32 4294967295, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   $sgpr3 = S_MOV_B32 14680064, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT:   $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT:   renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
   ; GFX90A-NEXT:   renamable $vgpr31 = COPY $vgpr0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 0fe54349215ba..c8eb797a0081a 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -129,9 +129,10 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_i1_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    s_mov_b32 s32, 0
@@ -234,7 +235,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -339,7 +341,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -422,9 +425,10 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 ; HSA-LABEL: test_call_external_void_func_i8_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; HSA-NEXT:    s_mov_b32 s32, 0
@@ -525,7 +529,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_sbyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -625,7 +630,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -707,9 +713,10 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_i16_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; HSA-NEXT:    s_mov_b32 s32, 0
@@ -809,7 +816,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_sshort v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -909,7 +917,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ushort v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -991,9 +1000,10 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
 ; HSA-LABEL: test_call_external_void_func_i32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 42
 ; HSA-NEXT:    s_mov_b32 s32, 0
@@ -1078,9 +1088,10 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_i64_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; HSA-NEXT:    v_mov_b32_e32 v1, 0
@@ -1186,7 +1197,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_mov_b32 s5, s4
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -1278,9 +1290,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v2i64_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
@@ -1395,7 +1408,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_mov_b32 s5, s4
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v4, 1
 ; HSA-NEXT:    v_mov_b32_e32 v5, 2
@@ -1518,7 +1532,8 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_mov_b32 s5, s4
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v4, 1
 ; HSA-NEXT:    v_mov_b32_e32 v5, 2
@@ -1605,9 +1620,10 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_f16_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x4400
 ; HSA-NEXT:    s_mov_b32 s32, 0
@@ -1689,9 +1705,10 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_f32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 4.0
 ; HSA-NEXT:    s_mov_b32 s32, 0
@@ -1776,9 +1793,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v2f32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
@@ -1868,9 +1886,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v3f32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
@@ -1968,9 +1987,10 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v5f32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
@@ -2059,9 +2079,10 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_f64_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 0x40100000
@@ -2154,9 +2175,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v2f64_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
@@ -2258,9 +2280,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v3f64_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
@@ -2360,9 +2383,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -2459,9 +2483,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -2559,9 +2584,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -2647,9 +2673,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v3i16_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; HSA-NEXT:    v_mov_b32_e32 v1, 3
@@ -2737,9 +2764,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v3f16_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x40003c00
 ; HSA-NEXT:    v_mov_b32_e32 v1, 0x4400
@@ -2838,9 +2866,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -2928,9 +2957,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v4i16_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; HSA-NEXT:    v_mov_b32_e32 v1, 0x40003
@@ -3028,9 +3058,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -3123,9 +3154,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -3210,9 +3242,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v2i32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
@@ -3302,9 +3335,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; HSA-LABEL: test_call_external_void_func_v3i32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 3
 ; HSA-NEXT:    v_mov_b32_e32 v1, 4
@@ -3398,9 +3432,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; HSA-LABEL: test_call_external_void_func_v3i32_i32:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 3
 ; HSA-NEXT:    v_mov_b32_e32 v1, 4
@@ -3496,9 +3531,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -3590,9 +3626,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v4i32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
@@ -3691,9 +3728,10 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v5i32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
@@ -3808,7 +3846,8 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -3915,9 +3954,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_v8i32_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
@@ -4045,7 +4085,8 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
 ; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
 ; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -4194,7 +4235,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
 ; HSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
 ; HSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
 ; HSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_getpc_b64 s[8:9]
@@ -4359,7 +4401,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; HSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
 ; HSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
 ; HSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -4470,9 +4513,10 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x0
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 42
 ; HSA-NEXT:    s_mov_b32 s32, 0
@@ -4588,7 +4632,8 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; HSA-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:4
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -4704,9 +4749,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 3
 ; HSA-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:8
@@ -4879,9 +4925,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 s2, -1
 ; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 3
 ; HSA-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:8
@@ -5089,10 +5136,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
 ; HSA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
-; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
@@ -5346,7 +5394,8 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; HSA-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
 ; HSA-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x80
 ; HSA-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
-; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b32 s2, -1
+; HSA-NEXT:    s_mov_b32 s3, 0x11e80000
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
index efa412d6b0a32..8e2fca554e28c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
@@ -11,8 +11,9 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz
 ; CHECK-LABEL: known_x_0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s3, 0xe00000
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 20, v2
 ; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    v_lshl_or_b32 v31, v1, 10, v0
@@ -31,8 +32,9 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz
 ; CHECK-LABEL: known_y_0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s3, 0xe00000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    v_lshl_or_b32 v31, v2, 20, v0
 ; CHECK-NEXT:    s_mov_b32 s32, 0
@@ -50,8 +52,9 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz
 ; CHECK-LABEL: known_z_0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s3, 0xe00000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    v_lshl_or_b32 v31, v1, 10, v0
 ; CHECK-NEXT:    s_mov_b32 s32, 0
@@ -69,8 +72,9 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si
 ; CHECK-LABEL: known_yz_0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s3, 0xe00000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v0
 ; CHECK-NEXT:    s_mov_b32 s32, 0
@@ -88,8 +92,9 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si
 ; CHECK-LABEL: known_xz_0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s3, 0xe00000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v31, 10, v1
 ; CHECK-NEXT:    s_mov_b32 s32, 0
@@ -108,8 +113,9 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s
 ; CHECK-LABEL: known_xyz_0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s3, 0xe00000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    v_mov_b32_e32 v31, 0
 ; CHECK-NEXT:    s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 8e4baf4a8e5a4..6db5effdf04ed 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -7,12 +7,13 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
-; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT:    s_mov_b32 s3, 0xe00000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    ds_read_b32 v0, v0
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func@rel32@lo+4
@@ -30,8 +31,9 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s3, 0xe00000
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -54,8 +56,9 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
 ; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s3, 0xe00000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
@@ -76,8 +79,9 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
 ; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s3, 0xe00000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
@@ -103,7 +107,8 @@ define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-NEXT:    s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xe00000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index 98f60a76a9e82..42beb1c8ae256 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -68,10 +68,11 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX803-LABEL: test_kern_call:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 s2, -1
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX803-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX803-NEXT:    s_mov_b32 s3, 0x11e80000
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    s_mov_b64 s[0:1], flat_scratch
@@ -88,8 +89,9 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX900-LABEL: test_kern_call:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT:    s_mov_b32 s2, -1
 ; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX900-NEXT:    s_mov_b32 s3, 0xe00000
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX900-NEXT:    s_mov_b64 s[0:1], flat_scratch
@@ -112,11 +114,12 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT:    s_mov_b32 s2, -1
+; GFX1010-NEXT:    s_mov_b32 s3, 0x31c16000
 ; GFX1010-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT:    s_getpc_b64 s[16:17]
 ; GFX1010-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
 ; GFX1010-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
@@ -148,10 +151,11 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX803-LABEL: test_kern_stack_and_call:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 s2, -1
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX803-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX803-NEXT:    s_mov_b32 s3, 0x11e80000
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    s_mov_b64 s[0:1], flat_scratch
@@ -171,8 +175,9 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX900-LABEL: test_kern_stack_and_call:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT:    s_mov_b32 s2, -1
 ; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX900-NEXT:    s_mov_b32 s3, 0xe00000
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX900-NEXT:    s_mov_b64 s[0:1], flat_scratch
@@ -199,10 +204,11 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1010-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT:    s_mov_b32 s2, -1
+; GFX1010-NEXT:    s_mov_b32 s3, 0x31c16000
 ; GFX1010-NEXT:    s_mov_b64 s[0:1], s[10:11]
-; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
 ; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -311,10 +317,11 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX803-LABEL: test_force_fp_kern_call:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 s2, -1
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX803-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX803-NEXT:    s_mov_b32 s3, 0x11e80000
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    s_mov_b64 s[0:1], flat_scratch
@@ -332,8 +339,9 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX900-LABEL: test_force_fp_kern_call:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT:    s_mov_b32 s2, -1
 ; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX900-NEXT:    s_mov_b32 s3, 0xe00000
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX900-NEXT:    s_mov_b64 s[0:1], flat_scratch
@@ -358,11 +366,12 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT:    s_mov_b32 s2, -1
+; GFX1010-NEXT:    s_mov_b32 s3, 0x31c16000
 ; GFX1010-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT:    s_getpc_b64 s[16:17]
 ; GFX1010-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
 ; GFX1010-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
@@ -413,10 +422,11 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX803-LABEL: test_force_fp_kern_stack_and_call:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 s2, -1
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX803-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX803-NEXT:    s_mov_b32 s3, 0x11e80000
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    s_mov_b32 s33, 0
@@ -437,8 +447,9 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX900-LABEL: test_force_fp_kern_stack_and_call:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT:    s_mov_b32 s2, -1
 ; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; GFX900-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX900-NEXT:    s_mov_b32 s3, 0xe00000
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX900-NEXT:    s_mov_b32 s33, 0
@@ -467,10 +478,11 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1010-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GFX1010-NEXT:    s_mov_b32 s2, -1
+; GFX1010-NEXT:    s_mov_b32 s3, 0x31c16000
 ; GFX1010-NEXT:    s_mov_b64 s[0:1], s[10:11]
-; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
 ; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 408d3b2c4c229..68c632a0bf6f4 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -180,7 +180,8 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_load_dword s8, s[6:7], 0x0
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xe00000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -229,7 +230,8 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_load_dword s8, s[6:7], 0x0
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xe00000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index 955c0605f0536..ceeea64e2f9ea 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -22,9 +22,10 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
 ; CHECK-NEXT:    s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
 ; CHECK-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_and_b32 s4, 1, s7
+; CHECK-NEXT:    s_mov_b32 s3, 0xe00000
 ; CHECK-NEXT:    s_cmp_eq_u32 s4, 1
 ; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    s_cselect_b32 s5, s13, s11
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 594f3b33bfb36..2a8c009062228 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -12,7 +12,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0x1e8f000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b32 s13, s15
 ; GCN-NEXT:    s_mov_b32 s12, s14
@@ -37,7 +38,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GISEL-NEXT:    s_add_i32 s12, s12, s17
 ; GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GISEL-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GISEL-NEXT:    s_mov_b32 s2, -1
+; GISEL-NEXT:    s_mov_b32 s3, 0x1e8f000
 ; GISEL-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GISEL-NEXT:    s_mov_b32 s13, s15
 ; GISEL-NEXT:    s_mov_b32 s12, s14
@@ -67,7 +69,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0x1e8f000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b32 s13, s15
 ; GCN-NEXT:    s_mov_b32 s12, s14
@@ -93,7 +96,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 ; GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GISEL-NEXT:    s_add_i32 s12, s12, s17
 ; GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GISEL-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GISEL-NEXT:    s_mov_b32 s2, -1
+; GISEL-NEXT:    s_mov_b32 s3, 0x1e8f000
 ; GISEL-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GISEL-NEXT:    s_mov_b32 s13, s15
 ; GISEL-NEXT:    s_mov_b32 s12, s14
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
index 4944a6db486ba..8843efd2c3c79 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -11,7 +11,8 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
 ; CHECK-NEXT:    s_mov_b32 s33, 0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
+; CHECK-NEXT:    s_mov_b32 s3, 0xe00000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 96dfb1fbf8017..4851c4f73456a 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -118,8 +118,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -177,8 +178,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -236,8 +238,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -295,8 +298,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -349,7 +353,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s15, 0
 ; CHECK-NEXT:    ds_write_b16 v0, v1
@@ -375,8 +380,9 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -418,7 +424,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s15, 2
 ; CHECK-NEXT:    ds_write_b16 v0, v1
@@ -444,8 +451,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -487,7 +495,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s15, 1
 ; CHECK-NEXT:    ds_write_b16 v0, v1
@@ -513,8 +522,9 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -556,7 +566,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s15, 3
 ; CHECK-NEXT:    ds_write_b16 v0, v1
@@ -582,8 +593,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 5c33d50382174..26271a0a68652 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -45,7 +45,8 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0x1e8f000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GCN-NEXT:    s_getpc_b64 s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index 34ca2793997a4..b4a09810b0404 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -164,7 +164,8 @@ define amdgpu_kernel void @k01() {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0x1e8f000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch  
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
@@ -198,7 +199,8 @@ define amdgpu_kernel void @k23() {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0x1e8f000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch 
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
@@ -240,8 +242,9 @@ define amdgpu_kernel void @k123() {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
-; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0x1e8f000
+; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, f1 at gotpcrel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 9029e433700f5..46a2c6a14f695 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -229,7 +229,8 @@ define amdgpu_kernel void @k01() {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0x1e8f000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
@@ -268,7 +269,8 @@ define amdgpu_kernel void @k23() {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0x1e8f000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
@@ -310,7 +312,8 @@ define amdgpu_kernel void @k123() {
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GCN-NEXT:    s_add_i32 s6, s6, s9
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0x1e8f000
 ; GCN-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index e88b76b30197c..1f5abe363d70e 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -49,9 +49,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_add_u32 s42, s34, 40
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_addc_u32 s43, s35, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
@@ -771,9 +772,10 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_add_u32 s42, s36, 40
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b32 s2, -1
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[8:9]
 ; CHECK-NEXT:    s_addc_u32 s43, s37, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index 46b69537798e5..70a9bbbd47a3e 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -69,7 +69,8 @@ define amdgpu_kernel void @kernel_call() {
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
+; CHECK-NEXT:    s_mov_b32 s3, 0xe00000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -128,7 +129,8 @@ define amdgpu_kernel void @kernel_tailcall() {
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
+; CHECK-NEXT:    s_mov_b32 s3, 0xe00000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -240,7 +242,8 @@ define protected amdgpu_kernel void @kernel() {
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
+; CHECK-NEXT:    s_mov_b32 s3, 0xe00000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 92fbbdb9c4e87..fa2cefc6bd3e0 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -45,8 +45,8 @@ define amdgpu_kernel void @test_simple_indirect_call() {
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0xf0000000
-; GFX9-NEXT:    s_mov_b64 s[0:1], flat_scratch
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xe00000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s5
@@ -55,8 +55,9 @@ define amdgpu_kernel void @test_simple_indirect_call() {
 ; GFX9-NEXT:    s_add_u32 s6, s6, indirect at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, indirect at rel32@hi+12
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NEXT:    v_mov_b32_e32 v4, s7
+; GFX9-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GFX9-NEXT:    v_mad_u32_u24 v0, v1, s5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX9-NEXT:    v_add_lshl_u32 v0, v0, v2, 3
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    ds_write_b64 v0, v[3:4]
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 9510b400d1d6c..ecb4dbf5dfac9 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -75,8 +75,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_load_dwordx2 s[72:73], s[6:7], 0x0
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v1
+; GLOBALNESS1-NEXT:    s_mov_b32 s2, -1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v3
-; GLOBALNESS1-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GLOBALNESS1-NEXT:    s_mov_b32 s3, 0xe00000
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GLOBALNESS1-NEXT:    s_mov_b32 s68, s14
 ; GLOBALNESS1-NEXT:    s_mov_b32 s69, s13
@@ -357,8 +358,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[72:73], s[6:7], 0x0
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v1
+; GLOBALNESS0-NEXT:    s_mov_b32 s2, -1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v3
-; GLOBALNESS0-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; GLOBALNESS0-NEXT:    s_mov_b32 s3, 0xe00000
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; GLOBALNESS0-NEXT:    s_mov_b32 s66, s14
 ; GLOBALNESS0-NEXT:    s_mov_b32 s67, s13
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
index c8ca99926eab4..7d759089a7c0c 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
@@ -14,7 +14,8 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id
 ; CHECK-NEXT:    s_addc_u32 s11, s11, 0
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b32 s2, -1
+; CHECK-NEXT:    s_mov_b32 s3, 0x31c16000
 ; CHECK-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
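
A note on the test updates above: each kernel prologue now materializes words
2-3 of the scratch descriptor with two 32-bit moves instead of a single
s_mov_b64 of the fixed pattern 0xf0000000, because the immediates now come
from the per-target rsrc words. A minimal, compilable sketch of the split
(assuming the getScratchRsrcWords23() helper this series uses; not a verbatim
quote of the patch):

    #include <cstdint>

    // Split the 64-bit words 2-3 constant into the two s_mov_b32 immediates
    // the updated tests check for.
    static void splitRsrcWords23(uint64_t Rsrc23, uint32_t &Word2,
                                 uint32_t &Word3) {
      Word2 = uint32_t(Rsrc23 & 0xffffffff); // NumRecords -> "s_mov_b32 s2, -1"
      Word3 = uint32_t(Rsrc23 >> 32); // per-target bits, e.g. 0xe00000 above
    }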

>From 50312359081a87a646c1cbd96f82edcfd946584b Mon Sep 17 00:00:00 2001
From: Alexander Timofeev <alexander.timofeev at amd.com>
Date: Fri, 2 Feb 2024 19:49:58 +0100
Subject: [PATCH 6/7] [AMDGPU] Compiler should synthesize private buffer
 resource descriptor from flat_scratch_init

---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 49 ++++++++++++----------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index f5c3efecd7316..f12b83403e118 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -378,6 +378,7 @@ class PrologEpilogSGPRSpillBuilder {
 } // namespace llvm
 
 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
+// and return the FlatScratchInit register used.
 Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
@@ -420,10 +421,22 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
         break;
       }
     }
-    assert(FlatScratchInitReg && "Failed to find free register for scratch init");
 
-    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
-    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+  } else {
+    FlatScratchInitReg =
+        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
+
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    MRI.addLiveIn(FlatScratchInitReg);
+    MBB.addLiveIn(FlatScratchInitReg);
+  }
+
+  assert(FlatScratchInitReg && "Failed to find free register for scratch init");
+
+  FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+  FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+
+  if (ST.isAmdPalOS()) {
 
     buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg);
 
@@ -449,20 +462,9 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
     // Mask the offset in [47:0] of the descriptor
     const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
     auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
-        .addReg(FlatScrInitHi)
-        .addImm(0xffff);
+                   .addReg(FlatScrInitHi)
+                   .addImm(0xffff);
     And->getOperand(3).setIsDead(); // Mark SCC as dead.
-  } else {
-    FlatScratchInitReg =
-        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
-    assert(FlatScratchInitReg);
-
-    MachineRegisterInfo &MRI = MF.getRegInfo();
-    MRI.addLiveIn(FlatScratchInitReg);
-    MBB.addLiveIn(FlatScratchInitReg);
-
-    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
-    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
   }
 
   // Do a 64-bit pointer add.
@@ -488,10 +490,11 @@ Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
       return FlatScratchInitReg;
     }
 
-    // For GFX9.
+    assert(ST.getGeneration() == AMDGPUSubtarget::GFX9);
+
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
-      .addReg(FlatScrInitLo)
-      .addReg(ScratchWaveOffsetReg);
+        .addReg(FlatScrInitLo)
+        .addReg(ScratchWaveOffsetReg);
     auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                         AMDGPU::FLAT_SCR_HI)
       .addReg(FlatScrInitHi)
@@ -836,18 +839,18 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
 
     if (FlatScratchInit) {
       const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
-      Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
-      Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+      Register Lo_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+      Register Hi_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
       uint64_t Rsrc23 = TII->getScratchRsrcWords23();
       I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
                   TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
               .addReg(FlatScratchInit)
               .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-      BuildMI(MBB, I, DL, SMovB32, Rsrc2)
+      BuildMI(MBB, I, DL, SMovB32, Lo_32)
           .addImm(Rsrc23 & 0xffffffff)
           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 
-      BuildMI(MBB, I, DL, SMovB32, Rsrc3)
+      BuildMI(MBB, I, DL, SMovB32, Hi_32)
           .addImm(Rsrc23 >> 32)
           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
       return;
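
Taken together, the hunks above restructure emitEntryFunctionFlatScratchInit
so that the init register is resolved first and split into subregisters once,
for both the PAL and preloaded paths. A condensed control-flow sketch
(identifiers as in the patch; the free-register scan, GIT pointer load,
pointer add, and per-generation FLAT_SCR setup are elided):

    Register FlatScratchInitReg;
    if (ST.isAmdPalOS()) {
      // scan the SGPR register classes for a free 64-bit register (loop elided)
    } else {
      FlatScratchInitReg =
          MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
      MRI.addLiveIn(FlatScratchInitReg);
      MBB.addLiveIn(FlatScratchInitReg);
    }
    assert(FlatScratchInitReg && "Failed to find free register for scratch init");
    Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
    // ... FLAT_SCR initialization ...
    return FlatScratchInitReg; // callers feed this into the scratch rsrc setup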

>From de3555e7d6ccc67c535a2ca56a930605f6de8465 Mon Sep 17 00:00:00 2001
From: Alexander Timofeev <alexander.timofeev at amd.com>
Date: Fri, 2 Feb 2024 19:49:58 +0100
Subject: [PATCH 7/7] [AMDGPU] Compiler should synthesize private buffer
 resource descriptor from flat_scratch_init

---
 llvm/docs/AMDGPUUsage.rst | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 5fcf651046943..a1a494e9df723 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5468,9 +5468,13 @@ If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies
 Instead the flat SCRATCH instructions are used.
 
 Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs
-that are used as a V# to access scratch. CP uses the value provided by the
-runtime. It is used, together with Scratch Wavefront Offset as an offset, to
-access the private memory space using a segment address. See
+that are used as a V# to access scratch.
+FIXME: The compiler synthesizes the initialization value for the Private
+Segment Buffer in the kernel prologue, using the Flat Scratch Init to
+initialize the low 64 bits and a known constant for the high 32 bits. If the
+Flat Scratch Init is not available, CP uses the value provided by the runtime.
+It is used, together with Scratch Wavefront Offset as an offset, to access
+the private memory space using a segment address. See
 :ref:`amdgpu-amdhsa-initial-kernel-execution-state`.
 
 The scratch V# is a four-aligned SGPR and always selected for the kernel as


