[llvm] 273a0c8 - PrologEpilogInserter: Use explicit control for scavenge slot placement

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 23 15:01:16 PST 2021


Author: Matt Arsenault
Date: 2021-11-23T18:01:12-05:00
New Revision: 273a0c8bc9c774aa0d5982c23dc3d62b68ef4338

URL: https://github.com/llvm/llvm-project/commit/273a0c8bc9c774aa0d5982c23dc3d62b68ef4338
DIFF: https://github.com/llvm/llvm-project/commit/273a0c8bc9c774aa0d5982c23dc3d62b68ef4338.diff

LOG: PrologEpilogInserter: Use explicit control for scavenge slot placement

AMDGPU is unusual in that the stack is indexed in the same
direction as stack growth (up). We therefore always need the emergency
stack slots placed as low as possible to ensure they are in range of
load/store instruction immediate offsets. The existing logic is mostly
OK, but failed if we required stack realignment.

I don't understand what the existing control isFPCloseToIncomingSP is
supposed to mean, but it can only be used to stop placing the scavenge
slots earlier. Make this explicit so that targets can opt in rather
than only opt out.

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetFrameLowering.h
    llvm/lib/CodeGen/PrologEpilogInserter.cpp
    llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
    llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/lib/Target/AMDGPU/SIFrameLowering.h
    llvm/lib/Target/Mips/MipsFrameLowering.h
    llvm/lib/Target/SystemZ/SystemZFrameLowering.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/load-hi16.ll
    llvm/test/CodeGen/AMDGPU/load-lo16.ll
    llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
    llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
    llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
    llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
    llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
    llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
    llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
    llvm/test/CodeGen/AMDGPU/store-hi16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index fa22ca6a98ac2..fb463c9a57008 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -139,10 +139,13 @@ class TargetFrameLowering {
   ///
   int getOffsetOfLocalArea() const { return LocalAreaOffset; }
 
-  /// isFPCloseToIncomingSP - Return true if the frame pointer is close to
-  /// the incoming stack pointer, false if it is close to the post-prologue
-  /// stack pointer.
-  virtual bool isFPCloseToIncomingSP() const { return true; }
+  /// Control the placement of special register scavenging spill slots when
+  /// allocating a stack frame.
+  ///
+  /// If this returns true, the frame indexes used by the RegScavenger will be
+  /// allocated closest to the incoming stack pointer.
+  virtual bool allocateScavengingFrameIndexesNearIncomingSP(
+    const MachineFunction &MF) const;
 
   /// assignCalleeSavedSpillSlots - Allows target to override spill slot
   /// assignment logic.  If implemented, assignCalleeSavedSpillSlots() should

diff  --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index afc70a9c7343c..29a88480fd9fe 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -901,9 +901,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) {
   // incoming stack pointer if a frame pointer is required and is closer
   // to the incoming rather than the final stack pointer.
   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
-  bool EarlyScavengingSlots = (TFI.hasFP(MF) && TFI.isFPCloseToIncomingSP() &&
-                               RegInfo->useFPForScavengingIndex(MF) &&
-                               !RegInfo->hasStackRealignment(MF));
+  bool EarlyScavengingSlots = TFI.allocateScavengingFrameIndexesNearIncomingSP(MF);
   if (RS && EarlyScavengingSlots) {
     SmallVector<int, 2> SFIs;
     RS->getScavengingFrameIndices(SFIs);

diff  --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
index b0594ec086b28..fbf190a52585b 100644
--- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -136,6 +136,16 @@ unsigned TargetFrameLowering::getStackAlignmentSkew(
   return 0;
 }
 
+bool TargetFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
+  const MachineFunction &MF) const {
+  if (!hasFP(MF))
+    return false;
+
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+  return RegInfo->useFPForScavengingIndex(MF) &&
+         !RegInfo->hasStackRealignment(MF);
+}
+
 bool TargetFrameLowering::isSafeForNoCSROpt(const Function &F) {
   if (!F.hasLocalLinkage() || F.hasAddressTaken() ||
       !F.hasFnAttribute(Attribute::NoRecurse))

diff  --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 882b9a203755e..4706c74be721d 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1364,6 +1364,34 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots(
   return false;
 }
 
+bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
+  const MachineFunction &MF) const {
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  uint64_t EstStackSize = MFI.estimateStackSize(MF);
+  uint64_t MaxOffset = EstStackSize - 1;
+
+  // We need the emergency stack slots to be allocated in range of the
+  // MUBUF/flat scratch immediate offset from the base register, so assign these
+  // first at the incoming SP position.
+  //
+  // TODO: We could try sorting the objects to find a hole in the first bytes
+  // rather than allocating as close as possible. This could save a lot of space
+  // on frames with alignment requirements.
+  if (ST.enableFlatScratch()) {
+    const SIInstrInfo *TII = ST.getInstrInfo();
+    if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
+                               SIInstrFlags::FlatScratch))
+      return false;
+  } else {
+    if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset))
+      return false;
+  }
+
+  return true;
+}
+
 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
   MachineFunction &MF,
   MachineBasicBlock &MBB,

diff  --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 951ea79b28090..56fbb875ffd92 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -43,6 +43,9 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
                               const TargetRegisterInfo *TRI,
                               std::vector<CalleeSavedInfo> &CSI) const override;
 
+  bool allocateScavengingFrameIndexesNearIncomingSP(
+    const MachineFunction &MF) const override;
+
   bool isSupportedStackID(TargetStackID::Value ID) const override;
 
   void processFunctionBeforeFrameFinalized(

diff  --git a/llvm/lib/Target/Mips/MipsFrameLowering.h b/llvm/lib/Target/Mips/MipsFrameLowering.h
index 612b2b712fa88..710a3d40c38ef 100644
--- a/llvm/lib/Target/Mips/MipsFrameLowering.h
+++ b/llvm/lib/Target/Mips/MipsFrameLowering.h
@@ -34,7 +34,10 @@ class MipsFrameLowering : public TargetFrameLowering {
 
   bool hasBP(const MachineFunction &MF) const;
 
-  bool isFPCloseToIncomingSP() const override { return false; }
+  bool allocateScavengingFrameIndexesNearIncomingSP(
+    const MachineFunction &MF) const override {
+    return false;
+  }
 
   bool enableShrinkWrapping(const MachineFunction &MF) const override {
     return true;

diff  --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
index 6fddb4f81c416..af219da79c328 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -29,7 +29,18 @@ class SystemZFrameLowering : public TargetFrameLowering {
   create(const SystemZSubtarget &STI);
 
   // Override TargetFrameLowering.
-  bool isFPCloseToIncomingSP() const override { return false; }
+  bool allocateScavengingFrameIndexesNearIncomingSP(
+    const MachineFunction &MF) const override {
+    // SystemZ wants normal register scavenging slots, as close to the stack or
+    // frame pointer as possible.
+    // The default implementation assumes an x86-like layout, where the frame
+    // pointer is at the opposite end of the frame from the stack pointer.
+    // This meant that when frame pointer elimination was disabled,
+    // the slots ended up being as close as possible to the incoming
+    // stack pointer, which is the opposite of what we want on SystemZ.
+    return false;
+  }
+
   bool hasReservedCallFrame(const MachineFunction &MF) const override;
   MachineBasicBlock::iterator
   eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -43,7 +54,6 @@ class SystemZELFFrameLowering : public SystemZFrameLowering {
   SystemZELFFrameLowering();
 
   // Override TargetFrameLowering.
-  bool isFPCloseToIncomingSP() const override { return false; }
   bool
   assignCalleeSavedSpillSlots(MachineFunction &MF,
                               const TargetRegisterInfo *TRI,

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 20bb529566d5e..04cc0c56292e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -465,9 +465,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_large_offset_foo:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
+; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, vcc_hi
 ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
@@ -486,14 +486,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
-; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
-; GFX10-NEXT:    scratch_load_dword v2, off, s32 glc dlc
+; GFX10-NEXT:    scratch_load_dword v2, off, s32 offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    scratch_store_dword v0, v3, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -568,10 +568,11 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
-; GFX9-NEXT:    scratch_store_dword off, v0, s32
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
+; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
-; GFX9-NEXT:    s_add_i32 s0, s0, s32
+; GFX9-NEXT:    s_add_i32 s0, s0, vcc_hi
 ; GFX9-NEXT:    scratch_store_dword off, v0, s0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
@@ -585,8 +586,9 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    s_movk_i32 s0, 0x3e80
-; GFX10-NEXT:    s_add_i32 s0, s0, s32
-; GFX10-NEXT:    scratch_store_dword off, v0, s32
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 4
+; GFX10-NEXT:    s_add_i32 s0, s0, vcc_lo
+; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_store_dword off, v1, s0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 9c05d5880572e..e357b5adc8d42 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -55,8 +55,8 @@ define void @callee_with_stack() #0 {
 ; MUBUF-NEXT:   s_addk_i32 s32, 0x200
 ; FLATSCR-NEXT: s_add_i32 s32, s32, 8
 ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
-; MUBUF-NEXT:   buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}}
-; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4{{$}}
+; MUBUF-NEXT:   buffer_store_dword v0, off, s[0:3], s33{{$}}
+; FLATSCR-NEXT: scratch_store_dword off, v0, s33{{$}}
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:   s_addk_i32 s32, 0xfe00
 ; FLATSCR-NEXT: s_add_i32 s32, s32, -8
@@ -242,8 +242,8 @@ define void @spill_only_csr_sgpr() {
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; MUBUF-DAG:   buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; FLATSCR-DAG: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
-; MUBUF-DAG:   buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8
-; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:8
+; MUBUF-DAG:   buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
+; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:4
 
 ; GCN:	;;#ASMSTART
 ; GCN-NEXT: ; clobber v41
@@ -270,8 +270,8 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
 ; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr:
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill
+; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 v0, s33, 63
 ; GCN-COUNT-60: v_writelane_b32 v0
@@ -280,8 +280,8 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
 ; MUBUF:   buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
 ; GCN: v_writelane_b32 v0
-; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8
-; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8
+; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4
+; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:4
 ; GCN: ;;#ASMSTART
 ; GCN: v_writelane_b32 v0
 
@@ -291,8 +291,8 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
 ; FLATSCR:      s_add_i32 s32, s32, -16
 ; GCN-NEXT: v_readlane_b32 s33, v0, 63
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:12 ; 4-byte Folded Reload
+; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
@@ -316,8 +316,8 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
 ; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr:
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill
+; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-COUNT-61: v_writelane_b32 v0,
 ; FLATSCR: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
@@ -340,8 +340,8 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
 ; FLATSCR-NEXT: s_add_i32 s32, s32, -16
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:12 ; 4-byte Folded Reload
+; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
@@ -369,14 +369,16 @@ define void @no_new_vgpr_for_fp_csr() #1 {
 ; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff
 ; MUBUF-NEXT:   s_and_b32 s33, s33, 0xfff80000
 ; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000
-; MUBUF-NEXT:   s_add_i32 s32, s32, 0x100000
-; FLATSCR-NEXT: s_addk_i32 s32, 0x4000
+; MUBUF-NEXT:   s_add_i32 s32, s32, 0x180000
+; FLATSCR-NEXT: s_addk_i32 s32, 0x6000
 ; GCN-NEXT:     v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; MUBUF-NEXT:   buffer_store_dword [[ZERO]], off, s[0:3], s33
-; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33
+; MUBUF-NEXT:   v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x2000{{$}}
+; MUBUF-NEXT:   buffer_store_dword [[ZERO]], [[OFFSET]], s[0:3], s33 offen{{$}}
+; FLATSCR-NEXT: s_add_i32 vcc_hi, s33, 0x2000
+; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], vcc_hi
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT:   s_add_i32 s32, s32, 0xfff00000
-; FLATSCR-NEXT: s_addk_i32 s32, 0xc000
+; MUBUF-NEXT:   s_add_i32 s32, s32, 0xffe80000
+; FLATSCR-NEXT: s_addk_i32 s32, 0xa000
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_setpc_b64
 define void @realign_stack_no_fp_elim() #1 {
@@ -388,16 +390,16 @@ define void @realign_stack_no_fp_elim() #1 {
 ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp:
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
+; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 v0, s33, 2
 ; GCN-NEXT: s_mov_b32 s33, s32
 ; GCN-NEXT: v_writelane_b32 v0, s30, 0
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; GCN: v_writelane_b32 v0, s31, 1
-; MUBUF:   buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
-; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4
+; MUBUF:   buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}}
+; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}}
 ; GCN-NEXT:     s_waitcnt vmcnt(0)
 ; GCN: ;;#ASMSTART
 ; MUBUF:        s_addk_i32 s32, 0x300
@@ -410,8 +412,8 @@ define void @realign_stack_no_fp_elim() #1 {
 ; FLATSCR-NEXT: s_add_i32 s32, s32, -12
 ; GCN-NEXT:     v_readlane_b32 s33, v0, 2
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload
+; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:   s_setpc_b64 s[4:5]
@@ -434,8 +436,8 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr:
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
+; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
 ; GCN-NEXT: s_mov_b32 s33, s32
@@ -456,8 +458,8 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; FLATSCR-NEXT: s_add_i32 s32, s32, -12{{$}}
 ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload
+; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
@@ -486,9 +488,9 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset:
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200
+; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100
 ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
-; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1008
+; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1004
 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
@@ -509,9 +511,9 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; FLATSCR-NEXT: s_addk_i32 s32, 0xeff4{{$}}
 ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200
+; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload
-; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1008
+; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1004
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -567,10 +569,10 @@ define void @ipra_call_with_stack() #0 {
 ; With no free registers, we must spill the FP to memory.
 ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory:
 ; MUBUF:   v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
-; MUBUF:   buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4
+; MUBUF:   buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 ; 4-byte Folded Spill
 ; FLATSCR: s_mov_b32 s0, s33
 ; GCN:     s_mov_b32 s33, s32
-; MUBUF:   buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4
+; MUBUF:   buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Reload
 ; FLATSCR: s_mov_b32 s33, s0
 ; MUBUF:   s_waitcnt vmcnt(0)
 ; MUBUF:   v_readfirstlane_b32 s33, [[TMP_VGPR2]]
@@ -669,14 +671,14 @@ define void @callee_need_to_spill_fp_to_reg() #1 {
 ; scratch VGPR to hold the offset.
 ; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset
 ; MUBUF: s_or_saveexec_b64 s[4:5], -1
-; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200
+; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100
 ; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
 ; MUBUF: v_mov_b32_e32 v0, s33
 ; GCN-NOT: v_mov_b32_e32 v0, 0x100c
-; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40300
+; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200
 ; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
 ; FLATSCR: v_mov_b32_e32 v0, 0
-; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1004
+; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1000
 ; FLATSCR: scratch_store_dword off, v0, [[SOFF]]
 define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval([4096 x i8]) align 4 %arg) #3 {
   %alloca = alloca i32, addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 5b607f976f538..238c9d043a4a1 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -1485,7 +1485,7 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-LABEL: zero_init_large_offset_foo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
+; GFX9-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s0, 0
 ; GFX9-NEXT:    s_mov_b32 s1, s0
@@ -1495,13 +1495,13 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
-; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
-; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
-; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -1510,10 +1510,10 @@ define void @zero_init_large_offset_foo() {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s0, 0
-; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
 ; GFX10-NEXT:    s_mov_b32 s1, s0
 ; GFX10-NEXT:    s_mov_b32 s2, s0
 ; GFX10-NEXT:    s_mov_b32 s3, s0
@@ -1522,11 +1522,11 @@ define void @zero_init_large_offset_foo() {
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
-; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
-; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
-; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1534,7 +1534,7 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-PAL-LABEL: zero_init_large_offset_foo:
 ; GFX9-PAL:       ; %bb.0:
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
+; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
@@ -1544,13 +1544,13 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
+; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
-; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
+; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
-; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
+; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
-; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
+; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
@@ -1559,10 +1559,10 @@ define void @zero_init_large_offset_foo() {
 ; GFX1010-PAL:       ; %bb.0:
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
 ; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
@@ -1572,13 +1572,13 @@ define void @zero_init_large_offset_foo() {
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
 ; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
 ; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
 ; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
@@ -1587,10 +1587,10 @@ define void @zero_init_large_offset_foo() {
 ; GFX1030-PAL:       ; %bb.0:
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
 ; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
@@ -1599,11 +1599,11 @@ define void @zero_init_large_offset_foo() {
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
-; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
-; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
-; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
@@ -2015,9 +2015,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_large_offset_foo:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
+; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
 ; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
@@ -2034,12 +2034,12 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 15
-; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
 ; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
 ; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
-; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
+; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    scratch_store_dword v0, v1, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2050,9 +2050,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
+; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4000
+; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
@@ -2069,12 +2069,12 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
-; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4000
+; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, vcc_lo
 ; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
 ; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
 ; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
-; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
+; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2218,9 +2218,10 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-NEXT:    s_movk_i32 s0, 0x3000
-; GFX9-NEXT:    scratch_store_dword off, v0, s32
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
+; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_add_i32 s0, s0, s32
+; GFX9-NEXT:    s_add_i32 s0, s0, vcc_hi
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2235,8 +2236,9 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    s_movk_i32 s0, 0x3800
-; GFX10-NEXT:    s_add_i32 s0, s0, s32
-; GFX10-NEXT:    scratch_store_dword off, v0, s32
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 4
+; GFX10-NEXT:    s_add_i32 s0, s0, vcc_lo
+; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2249,9 +2251,10 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
-; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32
+; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 4
+; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    s_add_i32 s0, s0, s32
+; GFX9-PAL-NEXT:    s_add_i32 s0, s0, vcc_hi
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -2266,8 +2269,9 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
-; GFX10-PAL-NEXT:    s_add_i32 s0, s0, s32
-; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32
+; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 4
+; GFX10-PAL-NEXT:    s_add_i32 s0, s0, vcc_lo
+; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 9feadc16b681d..0a796227c8abb 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -806,15 +806,14 @@ entry:
 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
 ; GFX900-MUBUF:        buffer_store_dword
 ; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:   buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094
+; GFX900-MUBUF-NEXT:   buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4058
 ; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR:      scratch_store_dword
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4094
+; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4058
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
+define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 {
 entry:
-  %obj0 = alloca [10 x i32], align 4, addrspace(5)
   %obj1 = alloca [4096 x i16], align 2, addrspace(5)
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc
@@ -829,15 +828,14 @@ entry:
 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
 ; GFX900-MUBUF:        buffer_store_dword
 ; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:   buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
+; GFX900-MUBUF-NEXT:   buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4059
 ; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR:      scratch_store_dword
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095
+; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4059
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
+define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 {
 entry:
-  %obj0 = alloca [10 x i32], align 4, addrspace(5)
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc
@@ -853,15 +851,14 @@ entry:
 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
 ; GFX900-MUBUF:        buffer_store_dword
 ; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:   buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
+; GFX900-MUBUF-NEXT:   buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4059
 ; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR:      scratch_store_dword
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095
+; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4059
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
+define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 {
 entry:
-  %obj0 = alloca [10 x i32], align 4, addrspace(5)
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc

diff  --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 600d3ea38e1f8..d38084b96b4fd 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -1913,9 +1913,10 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 glc
+; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 44
+; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v0, v1, s[0:3], s32 offen offset:4054 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
@@ -1925,9 +1926,10 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094 glc
+; GFX906-NEXT:    v_mov_b32_e32 v3, 44
+; GFX906-NEXT:    buffer_load_ushort v1, v3, s[0:3], s32 offen offset:4054 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
@@ -1939,9 +1941,10 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094 glc
+; GFX803-NEXT:    v_mov_b32_e32 v2, 44
+; GFX803-NEXT:    buffer_load_ushort v1, v2, s[0:3], s32 offen offset:4054 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1953,9 +1956,10 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32
+; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v0, off, s32 offset:4094 glc
+; GFX900-FLATSCR-NEXT:    s_add_i32 vcc_hi, s32, 44
+; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v0, off, vcc_hi offset:4054 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -1978,9 +1982,10 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 glc
+; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 44
+; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
@@ -1990,9 +1995,10 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
+; GFX906-NEXT:    v_mov_b32_e32 v3, 44
+; GFX906-NEXT:    buffer_load_sbyte v1, v3, s[0:3], s32 offen offset:4055 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
@@ -2004,9 +2010,10 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
+; GFX803-NEXT:    v_mov_b32_e32 v2, 44
+; GFX803-NEXT:    buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -2018,9 +2025,10 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32
+; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, s32 offset:4095 glc
+; GFX900-FLATSCR-NEXT:    s_add_i32 vcc_hi, s32, 44
+; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, vcc_hi offset:4055 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -2044,9 +2052,10 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 glc
+; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 44
+; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
@@ -2056,9 +2065,10 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
+; GFX906-NEXT:    v_mov_b32_e32 v3, 44
+; GFX906-NEXT:    buffer_load_ubyte v1, v3, s[0:3], s32 offen offset:4055 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
@@ -2070,9 +2080,10 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
+; GFX803-NEXT:    v_mov_b32_e32 v2, 44
+; GFX803-NEXT:    buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
@@ -2085,9 +2096,10 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32
+; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, s32 offset:4095 glc
+; GFX900-FLATSCR-NEXT:    s_add_i32 vcc_hi, s32, 44
+; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, vcc_hi offset:4055 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -2111,9 +2123,10 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 glc
+; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 44
+; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
@@ -2123,9 +2136,10 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
+; GFX906-NEXT:    v_mov_b32_e32 v2, 44
+; GFX906-NEXT:    buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -2138,9 +2152,10 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
+; GFX803-NEXT:    v_mov_b32_e32 v2, 44
+; GFX803-NEXT:    buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -2152,9 +2167,10 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32
+; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, s32 offset:4095 glc
+; GFX900-FLATSCR-NEXT:    s_add_i32 vcc_hi, s32, 44
+; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, vcc_hi offset:4055 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -2179,9 +2195,10 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 glc
+; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 44
+; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
@@ -2191,9 +2208,10 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
+; GFX906-NEXT:    v_mov_b32_e32 v2, 44
+; GFX906-NEXT:    buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -2206,9 +2224,10 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
+; GFX803-NEXT:    v_mov_b32_e32 v2, 44
+; GFX803-NEXT:    buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
@@ -2221,9 +2240,10 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32
+; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, s32 offset:4095 glc
+; GFX900-FLATSCR-NEXT:    s_add_i32 vcc_hi, s32, 44
+; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, vcc_hi offset:4055 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index 619a2d7e9deb1..c6b9f8f28412f 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -112,12 +112,13 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
 ; MUBUF-NEXT:    s_add_i32 s33, s32, 0x7ffc0
 ; MUBUF-NEXT:    s_and_b32 s33, s33, 0xfff80000
 ; MUBUF-NEXT:    v_lshrrev_b32_e64 v3, 6, s33
-; MUBUF-NEXT:    v_add_u32_e32 v3, 0x1000, v3
+; MUBUF-NEXT:    v_add_u32_e32 v3, 0x3000, v3
 ; MUBUF-NEXT:    v_add_u32_e32 v2, 64, v3
 ; MUBUF-NEXT:    v_mov_b32_e32 v4, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v5, 0x2000
 ; MUBUF-NEXT:    s_mov_b32 s4, 0
-; MUBUF-NEXT:    s_add_i32 s32, s32, 0x180000
-; MUBUF-NEXT:    buffer_store_dword v4, off, s[0:3], s33
+; MUBUF-NEXT:    s_add_i32 s32, s32, 0x200000
+; MUBUF-NEXT:    buffer_store_dword v4, v5, s[0:3], s33 offen
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:  .LBB1_1: ; %loadstoreloop
 ; MUBUF-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -129,7 +130,7 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
 ; MUBUF-NEXT:    s_cbranch_scc1 .LBB1_1
 ; MUBUF-NEXT:  ; %bb.2: ; %split
 ; MUBUF-NEXT:    v_lshrrev_b32_e64 v3, 6, s33
-; MUBUF-NEXT:    v_add_u32_e32 v3, 0x1000, v3
+; MUBUF-NEXT:    v_add_u32_e32 v3, 0x3000, v3
 ; MUBUF-NEXT:    v_add_u32_e32 v3, 0x20d0, v3
 ; MUBUF-NEXT:    buffer_load_dword v4, v3, s[0:3], 0 offen glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
@@ -139,7 +140,7 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:4 glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    s_add_i32 s32, s32, 0xffe80000
+; MUBUF-NEXT:    s_add_i32 s32, s32, 0xffe00000
 ; MUBUF-NEXT:    s_mov_b32 s33, s5
 ; MUBUF-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v6
 ; MUBUF-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v7, vcc
@@ -153,14 +154,15 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
 ; FLATSCR-NEXT:    s_mov_b32 s2, s33
 ; FLATSCR-NEXT:    s_add_i32 s33, s32, 0x1fff
 ; FLATSCR-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 0x8000
 ; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
+; FLATSCR-NEXT:    s_add_i32 vcc_hi, s33, 0x2000
 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
-; FLATSCR-NEXT:    s_addk_i32 s32, 0x6000
-; FLATSCR-NEXT:    scratch_store_dword off, v2, s33
+; FLATSCR-NEXT:    scratch_store_dword off, v2, vcc_hi
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:  .LBB1_1: ; %loadstoreloop
 ; FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; FLATSCR-NEXT:    s_add_i32 vcc_hi, s33, 0x1000
+; FLATSCR-NEXT:    s_add_i32 vcc_hi, s33, 0x3000
 ; FLATSCR-NEXT:    s_add_i32 s1, s0, vcc_hi
 ; FLATSCR-NEXT:    s_add_i32 s0, s0, 1
 ; FLATSCR-NEXT:    s_cmpk_lt_u32 s0, 0x2120
@@ -169,14 +171,14 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
 ; FLATSCR-NEXT:    s_cbranch_scc1 .LBB1_1
 ; FLATSCR-NEXT:  ; %bb.2: ; %split
 ; FLATSCR-NEXT:    s_movk_i32 s0, 0x2000
-; FLATSCR-NEXT:    s_add_i32 s1, s33, 0x1000
+; FLATSCR-NEXT:    s_add_i32 s1, s33, 0x3000
 ; FLATSCR-NEXT:    s_add_i32 s0, s0, s1
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    s_add_i32 s0, s33, 0x1000
+; FLATSCR-NEXT:    s_add_i32 s0, s33, 0x3000
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    s_addk_i32 s32, 0xa000
+; FLATSCR-NEXT:    s_addk_i32 s32, 0x8000
 ; FLATSCR-NEXT:    s_mov_b32 s33, s2
 ; FLATSCR-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; FLATSCR-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index 47d38fbfd7365..68646ad229a7c 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -10,7 +10,7 @@ define hidden fastcc void @callee_has_fp() #1 {
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0x200
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 1
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0xfffffe00
 ; CHECK-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
index 9e896c4a1c5c3..eef72204b138b 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
@@ -30,25 +30,29 @@ body:             |
     ; CHECK: liveins: $vgpr1, $vgpr2
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc
+    ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc
     ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
     ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
     ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
     ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
-    ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc
+    ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
     ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
     ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
     ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc
-    ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33
+    ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33
     ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc
     ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
+    ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
+    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc
+    ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33
+    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc
+    ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
     ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
-    ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc
+    ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc
     ; CHECK-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0
     ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc
+    ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc
     ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
     ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc
@@ -85,16 +89,20 @@ body:             |
     ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr33
     ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
     ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
-    ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc
+    ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
     ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
     ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
     ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc
-    ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33
+    ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33
     ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc
     ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
+    ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
+    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc
+    ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33
+    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc
+    ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
     ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31
-    ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc
+    ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc
     ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr29
     ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc
     S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
@@ -130,14 +138,16 @@ body:             |
     ; CHECK-NEXT: $sgpr28 = frame-setup COPY $sgpr33
     ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
     ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
-    ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc
+    ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
     ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
     ; CHECK-NEXT: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
     ; CHECK-NEXT: $sgpr29 = S_ADD_I32 killed $sgpr29, 8192, implicit-def $scc
+    ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr29
+    ; CHECK-NEXT: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
+    ; CHECK-NEXT: $sgpr29 = S_ADD_I32 killed $sgpr29, 16384, implicit-def $scc
     ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr29
     ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31
-    ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc
+    ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc
     ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr28
     ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc
     S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
@@ -172,14 +182,16 @@ body:             |
     ; CHECK-NEXT: $sgpr28 = frame-setup COPY $sgpr33
     ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
     ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
-    ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc
+    ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
     ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31
     ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
     ; CHECK-NEXT: $vcc_lo = S_MOV_B32 8192
+    ; CHECK-NEXT: $vgpr0, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr0, 0, implicit $exec
+    ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; CHECK-NEXT: $vcc_lo = S_MOV_B32 16384
     ; CHECK-NEXT: $vgpr2, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec
     ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31
-    ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc
+    ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc
     ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr28
     ; CHECK-NEXT: S_ENDPGM 0
     S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31

diff  --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
index 43c3c2fd088f3..4ab5307cbcb73 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
@@ -26,22 +26,23 @@ body:             |
     ; MUBUF: liveins: $vgpr1, $vgpr2
     ; MUBUF-NEXT: {{  $}}
     ; MUBUF-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; MUBUF-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc
+    ; MUBUF-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc
     ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
     ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
     ; MUBUF-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
     ; MUBUF-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
-    ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc
+    ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
     ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
     ; MUBUF-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; MUBUF-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec
     ; MUBUF-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec
+    ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec
     ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
-    ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc
+    ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc
     ; MUBUF-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0
     ; MUBUF-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; MUBUF-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc
+    ; MUBUF-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc
     ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
     ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; MUBUF-NEXT: S_ENDPGM 0, implicit $vcc
@@ -49,22 +50,24 @@ body:             |
     ; FLATSCR: liveins: $vgpr1, $vgpr2
     ; FLATSCR-NEXT: {{  $}}
     ; FLATSCR-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 8196, implicit-def $scc
+    ; FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def $scc
     ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
     ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
     ; FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc
     ; FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc
-    ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc
+    ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc
     ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec
     ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc
-    ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
+    ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec
     ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -8192, implicit-def $scc
-    ; FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc
+    ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc
+    ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
+    ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -16384, implicit-def $scc
+    ; FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc
     ; FLATSCR-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0
     ; FLATSCR-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 8196, implicit-def $scc
+    ; FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def $scc
     ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
     ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; FLATSCR-NEXT: S_ENDPGM 0, implicit $vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
index 369ec09302691..74df03b12f794 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
@@ -25,19 +25,25 @@ body:             |
     ; CHECK: liveins: $vgpr1, $vgpr2
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 262400, implicit-def $scc
+    ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
     ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
     ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 262080, implicit-def $scc
     ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294705152, implicit-def dead $scc
-    ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 524288, implicit-def dead $scc
+    ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 786432, implicit-def dead $scc
     ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc
+    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 4096, implicit-def $scc
+    ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33
+    ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -4096, implicit-def $scc
+    ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc
     ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
-    ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -524288, implicit-def dead $scc
+    ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -786432, implicit-def dead $scc
     ; CHECK-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0
     ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+    ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 262400, implicit-def $scc
+    ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
     ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc
     S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
index 7089ed8a53da8..863e867e47062 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
@@ -27,73 +27,73 @@ body:             |
     ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, 
$vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc
+    ; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc
     ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
     ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GFX8-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
     ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
     ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
-    ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc
+    ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
     ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; GFX8-NEXT: $sgpr7 = S_ADD_I32 $sgpr33, 524800, implicit-def $scc
-    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr7, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
-    ; GFX8-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
     ; GFX8-NEXT: $vcc_lo = S_MOV_B32 8192
+    ; GFX8-NEXT: $vgpr0, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr0, 0, implicit $exec
+    ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; GFX8-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; GFX8-NEXT: $vcc_lo = S_MOV_B32 16384
     ; GFX8-NEXT: $vgpr3, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec
     ; GFX8-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec
-    ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc
+    ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc
     ; GFX8-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0
     ; GFX8-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc
+    ; GFX8-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc
     ; GFX8-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
     ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
-    ; GFX8-NEXT: $sgpr4 = S_ADD_I32 $sgpr33, 524800, implicit-def $scc
-    ; GFX8-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+    ; GFX8-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
     ; GFX8-NEXT: S_ENDPGM 0, csr_amdgpu_allvgprs
     ; GFX9-LABEL: name: pei_scavenge_vgpr_spill
     ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, 
$vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc
+    ; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
     ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GFX9-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
     ; GFX9-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
     ; GFX9-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
-    ; GFX9-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1572864, implicit-def dead $scc
+    ; GFX9-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
     ; GFX9-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; GFX9-NEXT: $sgpr7 = S_ADD_I32 $sgpr33, 524800, implicit-def $scc
-    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr7, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
     ; GFX9-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; GFX9-NEXT: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec
+    ; GFX9-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec
     ; GFX9-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec
-    ; GFX9-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1572864, implicit-def dead $scc
+    ; GFX9-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc
     ; GFX9-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0
     ; GFX9-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 524544, implicit-def $scc
+    ; GFX9-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 1048832, implicit-def $scc
     ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
     ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
-    ; GFX9-NEXT: $sgpr4 = S_ADD_I32 $sgpr33, 524800, implicit-def $scc
-    ; GFX9-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+    ; GFX9-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
     ; GFX9-NEXT: S_ENDPGM 0, csr_amdgpu_allvgprs
     ; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill
     ; GFX9-FLATSCR: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, 
$vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
     ; GFX9-FLATSCR-NEXT: {{  $}}
     ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 8196, implicit-def $scc
+    ; GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def $scc
     ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
     ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GFX9-FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2
     ; GFX9-FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc
     ; GFX9-FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc
-    ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc
-    ; GFX9-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec
+    ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc
     ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADD_I32 $sgpr33, 8192, implicit-def $scc
+    ; GFX9-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vcc_hi, implicit $exec
+    ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADD_I32 $sgpr33, 16384, implicit-def $scc
     ; GFX9-FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $vcc_hi, $vgpr1, implicit $exec
-    ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc
+    ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc
     ; GFX9-FLATSCR-NEXT: $sgpr33 = V_READLANE_B32 $vgpr2, 0
     ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 8196, implicit-def $scc
+    ; GFX9-FLATSCR-NEXT: $sgpr6 = S_ADD_I32 $sgpr32, 16388, implicit-def $scc
     ; GFX9-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
     ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, csr_amdgpu_allvgprs

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
index 1ba6f81c1e0c7..11e5f0f9aa71c 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -78,10 +78,10 @@ entry:
   ; 0x40000 / 64 = 4096 (for wave64)
   %a = load volatile i32, i32 addrspace(5)* %aptr
 
-  ; MUBUF:   s_add_i32 s32, s32, 0x40000
+  ; MUBUF:   s_add_i32 s32, s32, 0x40100
   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
-  ; MUBUF:   s_add_i32 s32, s32, 0xfffc0000
-  ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1000
+  ; MUBUF:   s_add_i32 s32, s32, 0xfffbff00
+  ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1004
   ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
 
@@ -97,10 +97,10 @@ entry:
 
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
 
-  ; MUBUF:   s_add_i32 s32, s32, 0x40000
+  ; MUBUF:   s_add_i32 s32, s32, 0x40100
   ; MUBUF:   buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
-  ; MUBUF:   s_add_i32 s32, s32, 0xfffc0000
-  ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1000
+  ; MUBUF:   s_add_i32 s32, s32, 0xfffbff00
+  ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1004
   ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload
 
    ; Force %a to spill with no free SGPRs
@@ -173,14 +173,16 @@ entry:
 ; GCN-LABEL: test_inst_offset_function
 define void @test_inst_offset_function() {
 entry:
-  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
-  ; the instruction offset field.
-  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
+  ; Occupy enough bytes of scratch, so the offset of the spill of %a
+  ; just fits in the instruction offset field when the emergency stack
+  ; slot is added. It's hard to hit the actual limit since we're also
+  ; going to insert the emergency stack slot for large frames.
+  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
   %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 
   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
-  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
-  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
+  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
+  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
   %a = load volatile i32, i32 addrspace(5)* %aptr
 
   ; Force %a to spill.
@@ -202,9 +204,9 @@ entry:
 
   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
   ; 0x40000 / 64 = 4096 (for wave64)
-  ; MUBUF:   s_add_i32 s4, s32, 0x40000
+  ; MUBUF:   s_add_i32 s4, s32, 0x40100
   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
-  ; FLATSCR: s_add_i32 s0, s32, 0x1000
+  ; FLATSCR: s_add_i32 s0, s32, 0x1004
   ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill
   %a = load volatile i32, i32 addrspace(5)* %aptr
 
@@ -220,16 +222,21 @@ entry:
 ; GCN-LABEL: test_sgpr_offset_subregs_function
 define void @test_sgpr_offset_subregs_function() {
 entry:
-  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
-  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
+  ; We want to test the spill of the last subreg of %a is the highest
+  ; valid value for the immediate offset. We enable the emergency
+  ; stack slot for large frames, so it's hard to get the frame layout
+  ; exactly as we want to test it.
+  ;
+  ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a
+  ; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in
   ; the instruction offset field.
-  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
+  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
   %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
   %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
 
+  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4084 ; 4-byte Folded Spill
   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
-  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
-  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4088 ; 8-byte Folded Spill
+  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4084 ; 8-byte Folded Spill
   %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
   %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
 
@@ -249,14 +256,14 @@ entry:
 ; GCN-LABEL: test_inst_offset_subregs_function
 define void @test_inst_offset_subregs_function() {
 entry:
-  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
-  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
+  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
+  ; does not fit below offset 4096 (4088 + 4 + 8 - 4 = 4096), and has to live
   ; in the SGPR offset.
-  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
+  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
   %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
   %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
 
-  ; 0x3ff00 / 64 = 4092 (for wave64)
+  ; 0x3ff00 / 64 = 4092 (for wave64)
   ; MUBUF: s_add_i32 s4, s32, 0x3ff00
   ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
   ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill

diff  --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll
index c9584c4a52e68..98f9dad9e851c 100644
--- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll
@@ -647,15 +647,14 @@ entry:
 ; GCN: s_waitcnt
 ; GFX900-MUBUF:        buffer_store_dword
 ; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:   buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094
+; GFX900-MUBUF-NEXT:   buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4058
 ; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR:      scratch_store_dword
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4094
+; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4058
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 {
+define void @store_private_hi_v2i16_to_offset(i32 %arg, [10 x i32] addrspace(5)* %obj0) #0 {
 entry:
-  %obj0 = alloca [10 x i32], align 4, addrspace(5)
   %obj1 = alloca [4096 x i16], align 2, addrspace(5)
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc
@@ -670,14 +669,13 @@ entry:
 ; GCN: s_waitcnt
 ; GFX900-MUBUF:        buffer_store_dword
 ; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:   buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095
+; GFX900-MUBUF-NEXT:   buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4059
 ; GFX900-FLATSCR:      scratch_store_dword
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4095
+; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4059
 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 {
+define void @store_private_hi_v2i16_i8_to_offset(i32 %arg, [10 x i32] addrspace(5)* %obj0) #0 {
 entry:
-  %obj0 = alloca [10 x i32], align 4, addrspace(5)
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc


        


More information about the llvm-commits mailing list