[llvm] 60b1967 - [AMDGPU] Add Scratch Wave Offset to Scratch Buffer Descriptor in entry functions

Scott Linder via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 19 12:40:52 PDT 2020


Author: Scott Linder
Date: 2020-03-19T15:35:16-04:00
New Revision: 60b1967c3933c42f473a3f7be5f40747547b6057

URL: https://github.com/llvm/llvm-project/commit/60b1967c3933c42f473a3f7be5f40747547b6057
DIFF: https://github.com/llvm/llvm-project/commit/60b1967c3933c42f473a3f7be5f40747547b6057.diff

LOG: [AMDGPU] Add Scratch Wave Offset to Scratch Buffer Descriptor in entry functions

Add the scratch wave offset to the scratch buffer descriptor (SRSrc) in
the entry function prologue. This allows us to remove the scratch wave
offset register from the calling convention ABI.
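
For illustration, the prologue setup folds the wave offset into the 48-bit
base address of the descriptor. A minimal sketch of the emitted code,
assuming the SRSrc lives in s[0:3] and the scratch wave offset arrives in
s9 (the actual registers depend on the preloaded argument layout):

    s_add_u32 s0, s0, s9  ; add wave offset to the low 32 bits of the base
    s_addc_u32 s1, s1, 0  ; carry into bits 32-47; cannot carry out of bit 47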

As part of this change, allow the use of an inline constant zero for the
SOffset of MUBUF instructions accessing the stack in entry functions
when a frame pointer is not requested/required. Entry functions with
calls still need to set up the calling convention ABI stack pointer
register, and reference it in order to address arguments of called
functions. The ABI stack pointer register remains unswizzled, but is now
wave-relative instead of queue-relative.
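
For example, a MUBUF stack access in an entry function that previously used
the scratch wave offset register as its SOffset can now use an inline
constant zero. A sketch, with illustrative register numbers:

    buffer_store_dword v0, off, s[0:3], s7 offset:4  ; before
    buffer_store_dword v0, off, s[0:3], 0 offset:4   ; after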

Non-entry functions also use an inline constant zero SOffset for
wave-relative scratch access, but continue to use the stack and frame
pointers as before. When the stack or frame pointer is converted to a
swizzled offset it is now scaled directly, as the scratch wave offset no
longer needs to be subtracted first.
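
As a worked example of the new scaling, assuming a wave64 target and an
unswizzled stack pointer value of 256:

    swizzled offset = unswizzled offset / wavefront size
                    = 256 / 64
                    = 4

Previously the scratch wave offset had to be subtracted before the division.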

Update llvm/docs/AMDGPUUsage.rst to reflect these changes to the calling
convention.

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D75138

Added: 
    llvm/test/CodeGen/AMDGPU/cc-update.ll

Modified: 
    llvm/docs/AMDGPUUsage.rst
    llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
    llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
    llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/lib/Target/AMDGPU/SIFrameLowering.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/lib/Target/AMDGPU/SIRegisterInfo.h
    llvm/lib/Target/AMDGPU/SIRegisterInfo.td
    llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
    llvm/test/CodeGen/AMDGPU/addrspacecast.ll
    llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
    llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
    llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
    llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
    llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
    llvm/test/CodeGen/AMDGPU/call-argument-types.ll
    llvm/test/CodeGen/AMDGPU/call-constant.ll
    llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
    llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
    llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
    llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
    llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
    llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
    llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
    llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
    llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
    llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
    llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
    llvm/test/CodeGen/AMDGPU/extload-private.ll
    llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
    llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
    llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
    llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir
    llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir
    llvm/test/CodeGen/AMDGPU/function-returns.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
    llvm/test/CodeGen/AMDGPU/indirect-call.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/test/CodeGen/AMDGPU/ipra.ll
    llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
    llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
    llvm/test/CodeGen/AMDGPU/load-hi16.ll
    llvm/test/CodeGen/AMDGPU/load-lo16.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
    llvm/test/CodeGen/AMDGPU/memory_clause.ll
    llvm/test/CodeGen/AMDGPU/mesa3d.ll
    llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir
    llvm/test/CodeGen/AMDGPU/misched-killflags.mir
    llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
    llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir
    llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
    llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
    llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
    llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
    llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
    llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll
    llvm/test/CodeGen/AMDGPU/private-element-size.ll
    llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
    llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
    llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
    llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
    llvm/test/CodeGen/AMDGPU/scratch-simple.ll
    llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
    llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
    llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
    llvm/test/CodeGen/AMDGPU/sibling-call.ll
    llvm/test/CodeGen/AMDGPU/spill-agpr.ll
    llvm/test/CodeGen/AMDGPU/spill-before-exec.mir
    llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
    llvm/test/CodeGen/AMDGPU/spill-m0.ll
    llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
    llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
    llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
    llvm/test/CodeGen/AMDGPU/stack-realign.ll
    llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
    llvm/test/CodeGen/AMDGPU/store-hi16.ll
    llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
    llvm/test/CodeGen/AMDGPU/subvector-test.mir
    llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
    llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
    llvm/test/CodeGen/AMDGPU/wqm.ll
    llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
    llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir
    llvm/test/DebugInfo/AMDGPU/variable-locations.ll

Removed: 
    llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll
    llvm/test/CodeGen/MIR/AMDGPU/mfi-parse-error-scratch-wave-offset-reg.mir
    llvm/test/CodeGen/MIR/AMDGPU/mfi-scratch-wave-offset-reg-class.mir


################################################################################
diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 197765a3071b..199d9549a2fd 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -353,9 +353,9 @@ supported for the ``amdgcn`` target.
   (scratch), and group (LDS) memory depending on if the address is within one
   of the aperture ranges. Flat access to scratch requires hardware aperture
   setup and setup in the kernel prologue (see
-  :ref:`amdgpu-amdhsa-flat-scratch`). Flat access to LDS requires hardware
-  aperture setup and M0 (GFX7-GFX8) register setup (see
-  :ref:`amdgpu-amdhsa-m0`).
+  :ref:`amdgpu-amdhsa-kernel-prolog-flat-scratch`). Flat access to LDS requires
+  hardware aperture setup and M0 (GFX7-GFX8) register setup (see
+  :ref:`amdgpu-amdhsa-kernel-prolog-m0`).
 
   To convert between a private or group address space address (termed a segment
   address) and a flat address the base address of the corresponding aperture
@@ -5954,7 +5954,7 @@ SGPR register initial state is defined in
                                                   must be used to set up FLAT
                                                   SCRATCH for flat addressing
                                                   (see
-                                                  :ref:`amdgpu-amdhsa-flat-scratch`).
+                                                  :ref:`amdgpu-amdhsa-kernel-prolog-flat-scratch`).
      ========== ========================== ====== ==============================
 
 The order of the VGPR registers is defined, but the compiler can specify which
@@ -6020,7 +6020,23 @@ following properties:
 Kernel Prolog
 ~~~~~~~~~~~~~
 
-.. _amdgpu-amdhsa-m0:
+The compiler performs initialization in the kernel prologue depending on the
+target and information about things like stack usage in the kernel and called
+functions. Some of this initialization requires the compiler to request certain
+User and System SGPRs be present in the
+:ref:`amdgpu-amdhsa-initial-kernel-execution-state` via the
+:ref:`amdgpu-amdhsa-kernel-descriptor`.
+
+.. _amdgpu-amdhsa-kernel-prolog-cfi:
+
+CFI
++++
+
+1. The CFI return address is undefined.
+2. The CFI CFA is defined using an expression which evaluates to a memory
+   location description for the private segment address ``0``.
+
+.. _amdgpu-amdhsa-kernel-prolog-m0:
 
 M0
 ++
@@ -6035,15 +6051,35 @@ GFX9-GFX10
   The M0 register is not used for range checking LDS accesses and so does not
   need to be initialized in the prolog.
 
-.. _amdgpu-amdhsa-flat-scratch:
+.. _amdgpu-amdhsa-kernel-prolog-stack-pointer:
+
+Stack Pointer
++++++++++++++
+
+If the kernel has function calls it must set up the ABI stack pointer described
+in :ref:`amdgpu-amdhsa-function-call-convention-non-kernel-functions` by
+setting SGPR32 to the unswizzled scratch offset of the address past the
+last local allocation.
+
+.. _amdgpu-amdhsa-kernel-prolog-frame-pointer:
+
+Frame Pointer
++++++++++++++
+
+If the kernel needs a frame pointer for the reasons defined in
+``SIFrameLowering`` then SGPR34 is used and is always set to ``0`` in the
+kernel prolog. If a frame pointer is not required then all uses of the frame
+pointer are replaced with immediate ``0`` offsets.
+
+.. _amdgpu-amdhsa-kernel-prolog-flat-scratch:
 
 Flat Scratch
 ++++++++++++
 
-If the kernel may use flat operations to access scratch memory, the prolog code
-must set up FLAT_SCRATCH register pair (FLAT_SCRATCH_LO/FLAT_SCRATCH_HI which
-are in SGPRn-4/SGPRn-3). Initialization uses Flat Scratch Init and Scratch
-Wavefront Offset SGPR registers (see
+If the kernel or any function it calls may use flat operations to access
+scratch memory, the prolog code must set up the FLAT_SCRATCH register pair
+(FLAT_SCRATCH_LO/FLAT_SCRATCH_HI which are in SGPRn-4/SGPRn-3). Initialization
+uses Flat Scratch Init and Scratch Wavefront Offset SGPR registers (see
 :ref:`amdgpu-amdhsa-initial-kernel-execution-state`):
 
 GFX6
@@ -6074,6 +6110,52 @@ GFX9-GFX10
   FLAT_SCRATCH pair for use as the flat scratch base in flat memory
   instructions.
 
+.. _amdgpu-amdhsa-kernel-prolog-private-segment-buffer:
+
+Private Segment Buffer
+++++++++++++++++++++++
+
+A set of four SGPRs beginning at a four-aligned SGPR index is always selected
+to serve as the scratch V# for the kernel as follows:
+
+  - If it is known during instruction selection that there is stack usage,
+    SGPR0-3 is reserved for use as the scratch V#.  Stack usage is assumed if
+    optimizations are disabled (``-O0``), if stack objects already exist (for
+    locals, etc.), or if there are any function calls.
+
+  - Otherwise, four high numbered SGPRs beginning at a four-aligned SGPR index
+    are reserved for the tentative scratch V#. These will be used if it is
+    determined that spilling is needed.
+
+    - If no use is made of the tentative scratch V#, then it is unreserved
+      and the register count is determined ignoring it.
+    - If use is made of the tentative scratch V#, then its register numbers
+      are shifted to the first four-aligned SGPR index after the highest one
+      allocated by the register allocator, and all uses are updated. The
+      register count includes them in the shifted location.
+    - In either case, if the processor has the SGPR allocation bug, the
+      tentative allocation is not shifted or unreserved in order to ensure
+      the register count is higher to work around the bug.
+
+    .. note::
+
+      This approach of using a tentative scratch V#, and shifting the register
+      numbers only if it is used, avoids having to perform register allocation
+      a second time if the tentative V# is eliminated. This is more efficient
+      and avoids the problem that the second register allocation may perform
+      spilling, which will fail as there is no longer a scratch V#.
+
+When the kernel prolog code is being emitted it is known whether the scratch V#
+described above is actually used. If it is, the prolog code must set it up by
+copying the Private Segment Buffer to the scratch V# registers and then adding
+the Private Segment Wavefront Offset to the queue base address in the V#. The
+result is a V# with a base address pointing to the beginning of the wavefront
+scratch backing memory.
+
+The Private Segment Buffer is always requested, but the Private Segment
+Wavefront Offset is only requested if it is used (see
+:ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
+
 .. _amdgpu-amdhsa-memory-model:
 
 Memory Model
@@ -8514,6 +8596,8 @@ Call Convention
 See :ref:`amdgpu-dwarf-address-space-mapping` for information on swizzled
 addresses. Unswizzled addresses are normal linear addresses.
 
+.. _amdgpu-amdhsa-function-call-convention-kernel-functions:
+
 Kernel Functions
 ++++++++++++++++
 
@@ -8537,44 +8621,10 @@ how the AMDGPU implements function calls:
         by-value struct?
       - What is ABI for lambda values?
 
-2.  The CFI return address is undefined.
-3.  If the kernel contains no calls then:
-
-    - If using the ``amdhsa`` OS ABI (see :ref:`amdgpu-os-table`), and know
-      during ISel that there is stack usage SGPR0-3 is reserved for use as the
-      scratch SRD and SGPR33 reserved for the wave scratch offset. Stack usage
-      is assumed if ``-O0``, if already aware of stack objects for locals, etc.,
-      or if there are any function calls.
-    - Otherwise, five high numbered SGPRs are reserved for the tentative scratch
-      SRD and wave scratch offset. These will be used if determine need to do
-      spilling.
-
-      - If no use is made of the tentative scratch SRD or wave scratch offset,
-        then they are unreserved and the register count is determined ignoring
-        them.
-      - If use is made of the tenatative scratch SRD or wave scratch offset,
-        then the register numbers used are shifted to be after the highest one
-        allocated by the register allocator, and all uses updated. The register
-        count will include them in the shifted location. Since register
-        allocation may introduce spills, this shifting allows them to be
-        eliminated without having to perform register allocation again.
-      - In either case, if the processor has the SGPR allocation bug, the
-        tentative allocation is not shifted or unreserved inorder to ensure the
-        register count is higher to workaround the bug.
-
-4.  If the kernel contains function calls:
-
-    - SP is set to the wave scratch offset.
-
-      - Since SP is an unswizzled address relative to the queue scratch base, an
-        wave scratch offset is an unswizzle offset, this means that if SP is
-        used to access swizzled scratch memory, it will access the private
-        segment address 0.
-
-      .. note::
+4.  The kernel performs certain setup in its prolog, as described in
+    :ref:`amdgpu-amdhsa-kernel-prolog`.
 
-        This is planned to be changed to be the unswizzled base address of the
-        wavefront scratch backing memory.
+.. _amdgpu-amdhsa-function-call-convention-non-kernel-functions:
 
 Non-Kernel Functions
 ++++++++++++++++++++
@@ -8582,26 +8632,23 @@ Non-Kernel Functions
 This section describes the call convention ABI for functions other than the
 outer kernel function.
 
-If a kernel has function calls then scratch is always allocated and used for the
-call stack which grows from low address to high address using the swizzled
+If a kernel has function calls then scratch is always allocated and used for
+the call stack which grows from low address to high address using the swizzled
 scratch address space.
 
 On entry to a function:
 
-1.  SGPR0-3 contain a V# with the following properties:
-
-    * Base address of the queue scratch backing memory.
-
-      .. note::
-
-        This is planned to be changed to be the unswizzled base address of the
-        wavefront scratch backing memory.
+1.  SGPR0-3 contain a V# with the following properties (see
+    :ref:`amdgpu-amdhsa-kernel-prolog-private-segment-buffer`):
 
+    * Base address pointing to the beginning of the wavefront scratch backing
+      memory.
     * Swizzled with dword element size and stride of wavefront size elements.
 
 2.  The FLAT_SCRATCH register pair is setup. See
-    :ref:`amdgpu-amdhsa-flat-scratch`.
-3.  GFX6-8: M0 register set to the size of LDS in bytes.
+    :ref:`amdgpu-amdhsa-kernel-prolog-flat-scratch`.
+3.  GFX6-8: M0 register set to the size of LDS in bytes. See
+    :ref:`amdgpu-amdhsa-kernel-prolog-m0`.
 4.  The EXEC register is set to the lanes active on entry to the function.
 5.  MODE register: *TBD*
 6.  VGPR0-31 and SGPR4-29 are used to pass function input arguments as described
@@ -8609,14 +8656,21 @@ On entry to a function:
 7.  SGPR30-31 return address (RA). The code address that the function must
     return to when it completes. The value is undefined if the function is *no
     return*.
-8.  SGPR32 is used for the stack pointer (SP). It is an unswizzled
-    scratch offset relative to the beginning of the queue scratch backing
-    memory.
+8.  SGPR32 is used for the stack pointer (SP). It is an unswizzled scratch
+    offset relative to the beginning of the wavefront scratch backing memory.
 
     The unswizzled SP can be used with buffer instructions as an unswizzled SGPR
     offset with the scratch V# in SGPR0-3 to access the stack in a swizzled
     manner.
 
+    The unswizzled SP value can be converted into the swizzled SP value by:
+
+      | swizzled SP = unswizzled SP / wavefront size
+
+    This may be used to obtain the private address space address of stack
+    objects and to convert this address to a flat address by adding the flat
+    scratch aperture base address.
+
     The swizzled SP value is always 4 bytes aligned for the ``r600``
     architecture and 16 byte aligned for the ``amdgcn`` architecture.
 
@@ -8639,41 +8693,14 @@ On entry to a function:
     arguments after the last local allocation and adjust SGPR32 to the address
     after the last local allocation.
 
-    .. note::
-
-      The SP value is planned to be changed to be the unswizzled offset relative
-      to the wavefront scratch backing memory.
-
-9.  SGPR33 wavefront scratch base offset. The unswizzled offset from the queue
-    scratch backing memory base to the base of the wavefront scratch backing
-    memory.
-
-    It is used to convert the unswizzled SP value to swizzled address in the
-    private address space by:
-
-      | private address = (unswizzled SP - wavefront scratch base offset) /
-        wavefront size
-
-    This may be used to obtain the private address of stack objects and to
-    convert these address to a flat address by adding the flat scratch aperture
-    base address.
-
-    .. note::
-
-      This is planned to be eliminated when SP is changed to be the unswizzled
-      offset relative to the wavefront scratch backing memory. The the
-      conversion simplifies to:
-
-        | private address = unswizzled SP / wavefront size
-
-10. All other registers are unspecified.
-11. Any necessary ``waitcnt`` has been performed to ensure memory is available
+9.  All other registers are unspecified.
+10. Any necessary ``waitcnt`` has been performed to ensure memory is available
     to the function.
 
 On exit from a function:
 
 1.  VGPR0-31 and SGPR4-29 are used to pass function result arguments as
-    described below. Any registers used are considered clobbered registers,
+    described below. Any registers used are considered clobbered registers.
 2.  The following registers are preserved and have the same value as on entry:
 
     * FLAT_SCRATCH
@@ -8872,7 +8899,7 @@ describes how the AMDGPU implements function calls:
 
 1.  SGPR34 is used as a frame pointer (FP) if necessary. Like the SP it is an
     unswizzled scratch address. It is only needed if runtime sized ``alloca``
-    are used, or for the reasons defined in ``SiFrameLowering``.
+    are used, or for the reasons defined in ``SIFrameLowering``.
 2.  Runtime stack alignment is not currently supported.
 
     .. TODO::
@@ -8886,14 +8913,11 @@ describes how the AMDGPU implements function calls:
 
     ..note::
 
-      Before CFI is generated, the call convention will be changed so that SP is
-      an unswizzled address relative to the wave scratch base.
-
       CFI will be generated that defines the CFA as the unswizzled address
       relative to the wave scratch base in the unswizzled private address space
       of the lowest address stack allocated local variable.
 
-      ``DW_AT_frame_base`` will be defined as the swizelled address in the
+      ``DW_AT_frame_base`` will be defined as the swizzled address in the
       swizzled private address space by dividing the CFA by the wavefront size
       (since CFA is always at least dword aligned which matches the scratch
       swizzle element size).

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index e7e5e132ecbd..86586b3009ad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -800,8 +800,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(
     TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
   } else {
     CCInfo.AllocateReg(Info->getScratchRSrcReg());
-    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
-    CCInfo.AllocateReg(Info->getFrameOffsetReg());
     TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
   }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 4265e9992a50..507c14a63d9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1474,6 +1474,7 @@ static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
 }
 
 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+  SDLoc DL(N);
   const MachineFunction &MF = CurDAG->getMachineFunction();
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
@@ -1488,9 +1489,8 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const
   }
 
   // If we don't know this private access is a local stack object, it needs to
-  // be relative to the entry point's scratch wave offset register.
-  return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
-                                               MVT::i32));
+  // be relative to the entry point's scratch wave offset.
+  return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32));
 }
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
@@ -1515,10 +1515,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
     // In a call sequence, stores to the argument stack area are relative to the
     // stack pointer.
     const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
-    unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
-      Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
 
-    SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+    SOffset = isStackPtrRelative(PtrInfo)
+                  ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+                  : CurDAG->getTargetConstant(0, DL, MVT::i32);
     ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
     return true;
   }
@@ -1576,12 +1576,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
 
   const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
-  unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
-    Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
 
   // FIXME: Get from MachinePointerInfo? We should only be using the frame
   // offset if we know this is in a call sequence.
-  SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+  SOffset = isStackPtrRelative(PtrInfo)
+                ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+                : CurDAG->getTargetConstant(0, DL, MVT::i32);
 
   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
   return true;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b79c44604dab..f3cfbc1061ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2688,10 +2688,10 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
                const MachineMemOperand *MMO = *MI->memoperands_begin();
                const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
 
-               Register SOffsetReg = isStackPtrRelative(PtrInfo)
-                                         ? Info->getStackPtrOffsetReg()
-                                         : Info->getScratchWaveOffsetReg();
-               MIB.addReg(SOffsetReg);
+               if (isStackPtrRelative(PtrInfo))
+                 MIB.addReg(Info->getStackPtrOffsetReg());
+               else
+                 MIB.addImm(0);
              },
              [=](MachineInstrBuilder &MIB) { // offset
                MIB.addImm(Offset & 4095);
@@ -2728,13 +2728,6 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
     }
   }
 
-  // If we don't know this private access is a local stack object, it needs to
-  // be relative to the entry point's scratch wave offset register.
-  // TODO: Should split large offsets that don't fit like above.
-  // TODO: Don't use scratch wave offset just because the offset didn't fit.
-  Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
-                                   : Info->getScratchWaveOffsetReg();
-
   return {{[=](MachineInstrBuilder &MIB) { // rsrc
              MIB.addReg(Info->getScratchRSrcReg());
            },
@@ -2745,7 +2738,15 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
                MIB.addReg(VAddr);
            },
            [=](MachineInstrBuilder &MIB) { // soffset
-             MIB.addReg(SOffset);
+             // If we don't know this private access is a local stack object, it
+             // needs to be relative to the entry point's scratch wave offset.
+             // TODO: Should split large offsets that don't fit like above.
+             // TODO: Don't use scratch wave offset just because the offset
+             // didn't fit.
+             if (!Info->isEntryFunction() && FI.hasValue())
+               MIB.addReg(Info->getStackPtrOffsetReg());
+             else
+               MIB.addImm(0);
            },
            [=](MachineInstrBuilder &MIB) { // offset
              MIB.addImm(Offset);
@@ -2783,15 +2784,17 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset(
   const MachineMemOperand *MMO = *MI->memoperands_begin();
   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
 
-  Register SOffsetReg = isStackPtrRelative(PtrInfo)
-                            ? Info->getStackPtrOffsetReg()
-                            : Info->getScratchWaveOffsetReg();
   return {{
-      [=](MachineInstrBuilder &MIB) {
+      [=](MachineInstrBuilder &MIB) { // rsrc
         MIB.addReg(Info->getScratchRSrcReg());
-      },                                                         // rsrc
-      [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
-      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }      // offset
+      },
+      [=](MachineInstrBuilder &MIB) { // soffset
+        if (isStackPtrRelative(PtrInfo))
+          MIB.addReg(Info->getStackPtrOffsetReg());
+        else
+          MIB.addImm(0);
+      },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
   }};
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 8a210bc9738a..893ac5ff03be 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1076,7 +1076,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
   };
 
   if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
-      parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
       parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
       parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
     return true;
@@ -1086,11 +1085,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
     return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
   }
 
-  if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
-      !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
-    return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
-  }
-
   if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
       !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
     return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);

diff  --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index df843f4034ef..0f7db76babb0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -308,7 +308,6 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
   switch (RegNo) {
   case AMDGPU::FP_REG:
   case AMDGPU::SP_REG:
-  case AMDGPU::SCRATCH_WAVE_OFFSET_REG:
   case AMDGPU::PRIVATE_RSRC_REG:
     llvm_unreachable("pseudo-register should not ever be emitted");
   case AMDGPU::SCC:

diff  --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index e9344dbcdf75..73b7e7caaeae 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -612,19 +612,26 @@ void SIFoldOperands::foldOperand(
   if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
     // Sanity check that this is a stack access.
     // FIXME: Should probably use stack pseudos before frame lowering.
-    MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
-    if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
-                           SOff->getReg() != MFI->getStackPtrOffsetReg()))
-      return;
 
     if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
         MFI->getScratchRSrcReg())
       return;
 
+    // Ensure this is either relative to the current frame or the current wave.
+    MachineOperand &SOff =
+        *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+    if ((!SOff.isReg() || SOff.getReg() != MFI->getStackPtrOffsetReg()) &&
+        (!SOff.isImm() || SOff.getImm() != 0))
+      return;
+
     // A frame index will resolve to a positive constant, so it should always be
     // safe to fold the addressing mode, even pre-GFX9.
     UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
-    SOff->setReg(MFI->getStackPtrOffsetReg());
+
+    // If this is relative to the current wave, update it to be relative to the
+    // current frame.
+    if (SOff.isImm())
+      SOff.ChangeToRegister(MFI->getStackPtrOffsetReg(), false);
     return;
   }
 

diff  --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 71f2660cd090..1f2517bec6db 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -30,12 +30,6 @@ static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                       ST.getMaxNumSGPRs(MF) / 4);
 }
 
-static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
-                                       const MachineFunction &MF) {
-  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
-                      ST.getMaxNumSGPRs(MF));
-}
-
 // Find a scratch register that we can use at the start of the prologue to
 // re-align the stack pointer. We avoid using callee-save registers since they
 // may appear to be free when this is called from canUseAsPrologue (during
@@ -263,7 +257,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
 
 // Shift down registers reserved for the scratch RSRC.
 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
-    MachineFunction &MF) const {
+    MachineFunction &MF, Register ScratchWaveOffsetReg) const {
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -292,8 +286,6 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
   // cannot do this for the resources required for scratch access. For now we
   // skip over user SGPRs and may leave unused holes.
 
-  // We find the resource first because it has an alignment requirement.
-
   unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
   ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
   AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
@@ -303,7 +295,14 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
   for (MCPhysReg Reg : AllSGPR128s) {
     // Pick the first unallocated one. Make sure we don't clobber the other
     // reserved input we needed.
-    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+    //
+    // FIXME: The preloaded SGPR count is not accurate for shaders as the
+    // scratch wave offset may be in a fixed SGPR or
+    // SITargetLowering::allocateSystemSGPRs may choose some free SGPR for the
+    // scratch wave offset. We explicitly avoid the scratch wave offset to
+    // account for this.
+    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
+        !TRI->isSubRegisterEq(Reg, ScratchWaveOffsetReg)) {
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
       return Reg;
@@ -313,76 +312,6 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
   return ScratchRsrcReg;
 }
 
-// Shift down registers reserved for the scratch wave offset.
-Register SIFrameLowering::getEntryFunctionReservedScratchWaveOffsetReg(
-    MachineFunction &MF) const {
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
-  assert(MFI->isEntryFunction());
-
-  Register ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-
-  if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
-      (!MRI.isPhysRegUsed(ScratchWaveOffsetReg) && !hasFP(MF) &&
-       !MFI->hasFlatScratchInit())) {
-    assert(!hasFP(MF) && !MFI->hasFlatScratchInit());
-    return AMDGPU::NoRegister;
-  }
-
-  if (ST.hasSGPRInitBug() ||
-      ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
-    return ScratchWaveOffsetReg;
-
-  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
-
-  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
-  if (NumPreloaded > AllSGPRs.size())
-    return ScratchWaveOffsetReg;
-
-  AllSGPRs = AllSGPRs.slice(NumPreloaded);
-
-  // We need to drop register from the end of the list that we cannot use
-  // for the scratch wave offset.
-  // + 2 s102 and s103 do not exist on VI.
-  // + 2 for vcc
-  // + 2 for xnack_mask
-  // + 2 for flat_scratch
-  // + 4 for registers reserved for scratch resource register
-  // + 1 for register reserved for scratch wave offset.  (By exluding this
-  //     register from the list to consider, it means that when this
-  //     register is being used for the scratch wave offset and there
-  //     are no other free SGPRs, then the value will stay in this register.
-  // + 1 if stack pointer is used.
-  // ----
-  //  13 (+1)
-  unsigned ReservedRegCount = 13;
-
-  if (AllSGPRs.size() < ReservedRegCount)
-    return ScratchWaveOffsetReg;
-
-  for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
-    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
-    // scratch descriptor, since we haven’t added its uses yet.
-    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
-      MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
-      if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
-        assert(!hasFP(MF));
-        MFI->setStackPtrOffsetReg(Reg);
-      }
-      MFI->setScratchWaveOffsetReg(Reg);
-      MFI->setFrameOffsetReg(Reg);
-      return Reg;
-    }
-  }
-
-  return ScratchWaveOffsetReg;
-}
-
 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                 MachineBasicBlock &MBB) const {
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
@@ -401,128 +330,89 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = MF.getFunction();
 
   assert(MFI->isEntryFunction());
 
-  // We need to do the replacement of the private segment buffer and wave offset
-  // register even if there are no stack objects. There could be stores to undef
-  // or a constant without an associated object.
-  //
-  // These calls will return `AMDGPU::NoRegister` in cases where there are no
-  // actual uses of the respective registers.
-  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
-  Register ScratchWaveOffsetReg =
-      getEntryFunctionReservedScratchWaveOffsetReg(MF);
-
-  // Make the selected registers live throughout the function.
-  for (MachineBasicBlock &OtherBB : MF) {
-    if (&OtherBB == &MBB)
-      continue;
-
-    if (ScratchWaveOffsetReg != AMDGPU::NoRegister)
-      OtherBB.addLiveIn(ScratchWaveOffsetReg);
-
-    if (ScratchRsrcReg != AMDGPU::NoRegister)
-      OtherBB.addLiveIn(ScratchRsrcReg);
-  }
-
-  // Now that we have fixed the reserved registers we need to locate the
-  // (potentially) preloaded registers. We should always have a preloaded
-  // scratch wave offset register, but we only have a preloaded scratch rsrc
-  // register for HSA.
-  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
+  Register ScratchWaveOffsetReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
   // FIXME: Hack to not crash in situations which emitted an error.
-  if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
+  if (ScratchWaveOffsetReg == AMDGPU::NoRegister)
     return;
 
-  // We added live-ins during argument lowering, but since they were not used
-  // they were deleted. We're adding the uses now, so add them back.
-  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
-  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+  // We need to do the replacement of the private segment buffer register even
+  // if there are no stack objects. There could be stores to undef or a
+  // constant without an associated object.
+  //
+  // This will return `AMDGPU::NoRegister` in cases where there are no actual
+  // uses of the SRSRC.
+  Register ScratchRsrcReg =
+      getEntryFunctionReservedScratchRsrcReg(MF, ScratchWaveOffsetReg);
 
+  // Make the selected register live throughout the function.
+  if (ScratchRsrcReg != AMDGPU::NoRegister) {
+    for (MachineBasicBlock &OtherBB : MF) {
+      if (&OtherBB != &MBB) {
+        OtherBB.addLiveIn(ScratchRsrcReg);
+      }
+    }
+  }
+
+  // Now that we have fixed the reserved SRSRC we need to locate the
+  // (potentially) preloaded SRSRC.
   Register PreloadedScratchRsrcReg = AMDGPU::NoRegister;
   if (ST.isAmdHsaOrMesa(F)) {
     PreloadedScratchRsrcReg =
         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
     if (ScratchRsrcReg != AMDGPU::NoRegister &&
         PreloadedScratchRsrcReg != AMDGPU::NoRegister) {
+      // We added live-ins during argument lowering, but since they were not
+      // used they were deleted. We're adding the uses now, so add them back.
       MRI.addLiveIn(PreloadedScratchRsrcReg);
       MBB.addLiveIn(PreloadedScratchRsrcReg);
     }
   }
 
+  // Debug location must be unknown since the first debug location is used to
+  // determine the end of the prologue.
   DebugLoc DL;
   MachineBasicBlock::iterator I = MBB.begin();
 
-  const bool HasFP = hasFP(MF);
-
-  // If we are not HSA or we happened to reserved the original input registers,
-  // we don't need to copy to the reserved registers.
-  const bool CopyBuffer = ST.isAmdHsaOrMesa(F) &&
-                          ScratchRsrcReg != AMDGPU::NoRegister &&
-                          PreloadedScratchRsrcReg != AMDGPU::NoRegister &&
-                          ScratchRsrcReg != PreloadedScratchRsrcReg;
-
-  // This needs to be careful of the copying order to avoid overwriting one of
-  // the input registers before it's been copied to it's final
-  // destination. Usually the offset should be copied first.
-  const bool CopyBufferFirst =
-      TRI->isSubRegisterEq(PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
-
-  if (CopyBuffer && CopyBufferFirst) {
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
-        .addReg(PreloadedScratchRsrcReg, RegState::Kill);
-  }
-
-  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
-        .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
+  if (MF.getFrameInfo().hasCalls()) {
+    Register SPReg = MFI->getStackPtrOffsetReg();
+    assert(SPReg != AMDGPU::SP_REG);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
+        .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
   }
 
-  if (CopyBuffer && !CopyBufferFirst) {
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
-        .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+  if (hasFP(MF)) {
+    Register FPReg = MFI->getFrameOffsetReg();
+    assert(FPReg != AMDGPU::FP_REG);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
   }
 
-  // FIXME: This should also implement the setup path for HSA.
-  if (ScratchRsrcReg != AMDGPU::NoRegister) {
-    emitEntryFunctionScratchRsrcRegSetup(
-        MF, MBB, I, DL, PreloadedScratchRsrcReg, ScratchRsrcReg);
+  if (MFI->hasFlatScratchInit() || ScratchRsrcReg != AMDGPU::NoRegister) {
+    MRI.addLiveIn(ScratchWaveOffsetReg);
+    MBB.addLiveIn(ScratchWaveOffsetReg);
   }
 
-  if (HasFP) {
-    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-    int64_t StackSize = FrameInfo.getStackSize();
-
-    Register SPReg = MFI->getStackPtrOffsetReg();
-    assert(SPReg != AMDGPU::SP_REG);
-
-    // On kernel entry, the private scratch wave offset is the SP value.
-    if (StackSize == 0) {
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
-        .addReg(MFI->getScratchWaveOffsetReg());
-    } else {
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
-        .addReg(MFI->getScratchWaveOffsetReg())
-        .addImm(StackSize * ST.getWavefrontSize());
-    }
+  if (MFI->hasFlatScratchInit()) {
+    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
   }
 
-  if (MFI->hasFlatScratchInit()) {
-    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL,
-                                     MFI->getScratchWaveOffsetReg());
+  if (ScratchRsrcReg != AMDGPU::NoRegister) {
+    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
+                                         PreloadedScratchRsrcReg,
+                                         ScratchRsrcReg, ScratchWaveOffsetReg);
   }
 }
 
-// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoRegister`
+// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     const DebugLoc &DL, Register PreloadedScratchRsrcReg,
-    Register ScratchRsrcReg) const {
+    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -646,7 +536,37 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
     BuildMI(MBB, I, DL, SMovB32, Rsrc3)
       .addImm(Rsrc23 >> 32)
       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+  } else if (ST.isAmdHsaOrMesa(Fn)) {
+    assert(PreloadedScratchRsrcReg != AMDGPU::NoRegister);
+
+    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+    }
   }
+
+  // Add the scratch wave offset into the scratch RSRC.
+  //
+  // We only want to update the first 48 bits, which is the base address
+  // pointer, without touching the adjacent 16 bits of flags. We know this add
+  // cannot carry out from bit 47, otherwise the scratch allocation would be
+  // impossible to fit in the 48-bit global address space.
+  //
+  // TODO: Evaluate if it is better to just construct an SRD using the flat
+  // scratch init and some constants rather than update the one we are passed.
+  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+
+  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
+  // the kernel body via inreg arguments.
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
+      .addReg(ScratchRsrcSub0)
+      .addReg(ScratchWaveOffsetReg)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
+      .addReg(ScratchRsrcSub1)
+      .addImm(0)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 }
 
 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
@@ -1113,19 +1033,17 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
 
 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  if (MFI.hasCalls()) {
+
+  // For entry functions we can use an immediate offset in most cases, so the
+  // presence of calls doesn't imply we need a distinct frame pointer.
+  if (MFI.hasCalls() &&
+      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
     // All offsets are unsigned, so need to be addressed in the same direction
     // as stack growth.
 
     // FIXME: This function is pretty broken, since it can be called before the
     // frame layout is determined or CSR spills are inserted.
-    if (MFI.getStackSize() != 0)
-      return true;
-
-    // For the entry point, the input wave scratch offset must be copied to the
-    // API SP if there are calls.
-    if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
-      return true;
+    return MFI.getStackSize() != 0;
   }
 
   return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||

diff  --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 46525e9b1fbb..7314057d5ac1 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -61,17 +61,15 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
                                         const DebugLoc &DL,
                                         Register ScratchWaveOffsetReg) const;
 
-  Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
-
   Register
-  getEntryFunctionReservedScratchWaveOffsetReg(MachineFunction &MF) const;
-
-  void emitEntryFunctionScratchRsrcRegSetup(MachineFunction &MF,
-                                            MachineBasicBlock &MBB,
-                                            MachineBasicBlock::iterator I,
-                                            const DebugLoc &DL,
-                                            Register PreloadedPrivateBufferReg,
-                                            Register ScratchRsrcReg) const;
+  getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF,
+                                         Register ScratchWaveOffsetReg) const;
+
+  void emitEntryFunctionScratchRsrcRegSetup(
+      MachineFunction &MF, MachineBasicBlock &MBB,
+      MachineBasicBlock::iterator I, const DebugLoc &DL,
+      Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
+      Register ScratchWaveOffsetReg) const;
 
 public:
   bool hasFP(const MachineFunction &MF) const override;

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 34fe290ade61..0e95475a1974 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1912,67 +1912,45 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
     Info.setScratchRSrcReg(ReservedBufferReg);
   }
 
-  // hasFP should be accurate for kernels even before the frame is finalized.
-  if (ST.getFrameLowering()->hasFP(MF)) {
-    MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
 
-    // Try to use s32 as the SP, but move it if it would interfere with input
-    // arguments. This won't work with calls though.
-    //
-    // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
-    // registers.
-    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
-      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
-    } else {
-      assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
+  // For entry functions we have to set up the stack pointer if we use it,
+  // whereas non-entry functions get this "for free". This means there is no
+  // intrinsic advantage to using S32 over S34 in cases where we do not have
+  // calls but do need a frame pointer (i.e. if we are requested to have one
+  // because frame pointer elimination is disabled). To keep things simple we
+  // only ever use S32 as the call ABI stack pointer, and so using it does not
+  // imply we need a separate frame pointer.
+  //
+  // Try to use s32 as the SP, but move it if it would interfere with input
+  // arguments. This won't work with calls though.
+  //
+  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
+  // registers.
+  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
+    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
+  } else {
+    assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
 
-      if (MFI.hasCalls())
-        report_fatal_error("call in graphics shader with too many input SGPRs");
+    if (MFI.hasCalls())
+      report_fatal_error("call in graphics shader with too many input SGPRs");
 
-      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
-        if (!MRI.isLiveIn(Reg)) {
-          Info.setStackPtrOffsetReg(Reg);
-          break;
-        }
+    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
+      if (!MRI.isLiveIn(Reg)) {
+        Info.setStackPtrOffsetReg(Reg);
+        break;
       }
-
-      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
-        report_fatal_error("failed to find register for SP");
     }
 
-    if (MFI.hasCalls()) {
-      Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
-      Info.setFrameOffsetReg(AMDGPU::SGPR33);
-    } else {
-      unsigned ReservedOffsetReg =
-        TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-      Info.setFrameOffsetReg(ReservedOffsetReg);
-    }
-  } else if (RequiresStackAccess) {
-    assert(!MFI.hasCalls());
-    // We know there are accesses and they will be done relative to SP, so just
-    // pin it to the input.
-    //
-    // FIXME: Should not do this if inline asm is reading/writing these
-    // registers.
-    Register PreloadedSP = Info.getPreloadedReg(
-        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
-    Info.setStackPtrOffsetReg(PreloadedSP);
-    Info.setScratchWaveOffsetReg(PreloadedSP);
-    Info.setFrameOffsetReg(PreloadedSP);
-  } else {
-    assert(!MFI.hasCalls());
+    if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
+      report_fatal_error("failed to find register for SP");
+  }
 
-    // There may not be stack access at all. There may still be spills, or
-    // access of a constant pointer (in which cases an extra copy will be
-    // emitted in the prolog).
-    unsigned ReservedOffsetReg
-      = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-    Info.setStackPtrOffsetReg(ReservedOffsetReg);
-    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-    Info.setFrameOffsetReg(ReservedOffsetReg);
+  // hasFP should be accurate for entry functions even before the frame is
+  // finalized, because it does not rely on the known stack size, only
+  // properties like whether variable sized objects are present.
+  if (ST.getFrameLowering()->hasFP(MF)) {
+    Info.setFrameOffsetReg(AMDGPU::SGPR34);
   }
 }
 
@@ -2231,8 +2209,6 @@ SDValue SITargetLowering::LowerFormalArguments(
     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
   } else {
     CCInfo.AllocateReg(Info->getScratchRSrcReg());
-    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
-    CCInfo.AllocateReg(Info->getFrameOffsetReg());
     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
   }
 
@@ -10653,11 +10629,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
     MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
 
-  if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
-    MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
-                       Info->getScratchWaveOffsetReg());
-  }
-
   Info->limitOccupancy(MF);
 
   if (ST.isWave32() && !MF.empty()) {

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index a2bfd1e64219..d5ec8164956b 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -79,7 +79,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     // Non-entry functions have no special inputs for now, other registers
     // required for scratch access.
     ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
-    ScratchWaveOffsetReg = AMDGPU::SGPR33;
 
     // TODO: Pick a high register, and shift down, similar to a kernel.
     FrameOffsetReg = AMDGPU::SGPR34;
@@ -87,8 +86,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
 
     ArgInfo.PrivateSegmentBuffer =
       ArgDescriptor::createRegister(ScratchRSrcReg);
-    ArgInfo.PrivateSegmentWaveByteOffset =
-      ArgDescriptor::createRegister(ScratchWaveOffsetReg);
 
     if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
       ImplicitArgPtr = true;
@@ -515,7 +512,6 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
     WaveLimiter(MFI.needsWaveLimiter()),
     HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
     ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
-    ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)),
     FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
     StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
     ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index f70b8103ff4c..c6ccad800ccf 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -284,7 +284,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   uint32_t HighBitsOf32BitAddress = 0;
 
   StringValue ScratchRSrcReg = "$private_rsrc_reg";
-  StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg";
   StringValue FrameOffsetReg = "$fp_reg";
   StringValue StackPtrOffsetReg = "$sp_reg";
 
@@ -311,8 +310,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
     YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
     YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
                        StringValue("$private_rsrc_reg"));
-    YamlIO.mapOptional("scratchWaveOffsetReg", MFI.ScratchWaveOffsetReg,
-                       StringValue("$scratch_wave_offset_reg"));
     YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
                        StringValue("$fp_reg"));
     YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg,
@@ -336,14 +333,14 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   // Registers that may be reserved for spilling purposes. These may be the same
   // as the input registers.
   Register ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG;
-  Register ScratchWaveOffsetReg = AMDGPU::SCRATCH_WAVE_OFFSET_REG;
 
-  // This is the current function's incremented size from the kernel's scratch
-  // wave offset register. For an entry function, this is exactly the same as
-  // the ScratchWaveOffsetReg.
+  // This is the unswizzled offset from the current dispatch's scratch wave
+  // base to the beginning of the current function's frame.
   Register FrameOffsetReg = AMDGPU::FP_REG;
 
-  // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg.
+  // This is an ABI register used in the non-entry calling convention to
+  // communicate the unswizzled offset from the current dispatch's scratch wave
+  // base to the beginning of the new function's frame.
   Register StackPtrOffsetReg = AMDGPU::SP_REG;
 
   AMDGPUFunctionArgInfo ArgInfo;
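
To make the two comments above concrete: a private-memory byte addressed through these registers can be thought of as follows (a sketch under the same simplifications as before; names are illustrative):

  #include <cstdint>

  // FrameOffsetReg holds the unswizzled, wave-relative offset of the
  // current frame; StackPtrOffsetReg communicates the same kind of offset
  // for a callee's incoming frame.
  uint64_t privateAddress(uint64_t ScratchWaveBase, uint32_t UnswizzledOffset) {
    // ScratchWaveBase = dispatch scratch base + this wave's offset, which
    // after this patch is folded into the SRSrc descriptor.
    return ScratchWaveBase + UnswizzledOffset;
  }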
@@ -710,10 +707,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
     ScratchRSrcReg = Reg;
   }
 
-  Register getScratchWaveOffsetReg() const {
-    return ScratchWaveOffsetReg;
-  }
-
   Register getFrameOffsetReg() const {
     return FrameOffsetReg;
   }
@@ -736,11 +729,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
     return StackPtrOffsetReg;
   }
 
-  void setScratchWaveOffsetReg(Register Reg) {
-    assert(Reg != 0 && "Should never be unset");
-    ScratchWaveOffsetReg = Reg;
-  }
-
   Register getQueuePtrUserSGPR() const {
     return ArgInfo.QueuePtr.getRegister();
   }

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 0b12e649e3e2..806e03358f4a 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -91,6 +91,13 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const SIFrameLowering *TFI =
       MF.getSubtarget<GCNSubtarget>().getFrameLowering();
   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  // During ISel lowering we always reserve the stack pointer in entry
+  // functions, but never actually want to reference it when accessing our own
+  // frame. If we need a frame pointer we use it, but otherwise we can just use
+  // an immediate "0" which we represent by returning NoRegister.
+  if (FuncInfo->isEntryFunction()) {
+    return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
+  }
   return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                         : FuncInfo->getStackPtrOffsetReg();
 }
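
A condensed mirror of the selection logic above, for reference (Register 0 standing in for NoRegister):

  #include <cstdint>
  using Register = uint32_t;

  // Entry functions never address their own frame through SP; without an
  // FP they use the inline constant 0, modeled here as Register 0.
  Register pickFrameRegister(bool IsEntry, bool HasFP, Register FrameOffsetReg,
                             Register StackPtrOffsetReg) {
    if (IsEntry)
      return HasFP ? FrameOffsetReg : Register(0);
    return HasFP ? FrameOffsetReg : StackPtrOffsetReg;
  }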
@@ -177,29 +184,6 @@ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
 }
 
-static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
-  unsigned Reg;
-
-  // Try to place it in a hole after PrivateSegmentBufferReg.
-  if (RegCount & 3) {
-    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
-    // alignment constraints, so we have a hole where can put the wave offset.
-    Reg = RegCount - 1;
-  } else {
-    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
-    // wave offset before it.
-    Reg = RegCount - 5;
-  }
-
-  return Reg;
-}
-
-unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
-  const MachineFunction &MF) const {
-  unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
-  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
-}
-
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
 
@@ -279,19 +263,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
-  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
-    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
-    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
-  }
-
   unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
   if (ScratchRSrcReg != AMDGPU::NoRegister) {
     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
     // to spill.
     // TODO: May need to reserve a VGPR if doing LDS spilling.
     reserveRegisterTuples(Reserved, ScratchRSrcReg);
-    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
   }
 
   // We have to assume the SP is needed in case there are calls in the function,
@@ -722,6 +699,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
       SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
 
     if (SOffset == AMDGPU::NoRegister) {
+      if (ScratchOffsetReg == AMDGPU::NoRegister) {
+        report_fatal_error("could not scavenge SGPR to spill in entry function");
+      }
       // There are no free SGPRs, and we are in the process of spilling
       // VGPRs too.  Since we need a VGPR in order to spill SGPRs (this is true
       // on SI/CI and on VI it is true until we implement spilling using scalar
@@ -735,9 +715,14 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
       Scavenged = true;
     }
 
-    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
-      .addReg(ScratchOffsetReg)
-      .addImm(Offset);
+    if (ScratchOffsetReg == AMDGPU::NoRegister) {
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset)
+          .addImm(Offset);
+    } else {
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
+          .addReg(ScratchOffsetReg)
+          .addImm(Offset);
+    }
 
     Offset = 0;
   }
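
The branch added here reduces to a simple rule; a hypothetical restatement (not LLVM API):

  #include <cstdint>
  #include <optional>

  // When the frame-index offset does not fit the MUBUF immediate field:
  // with no scratch offset register (entry function, no FP) the offset is
  // already wave-absolute, so a plain move suffices (S_MOV_B32); otherwise
  // it must be added to the base register (S_ADD_U32).
  uint32_t materializeSOffset(std::optional<uint32_t> Base, uint32_t Offset) {
    return Base ? *Base + Offset : Offset;
  }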
@@ -772,16 +757,21 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                    EltSize, MinAlign(Align, EltSize * i));
 
       MIB = BuildMI(*MBB, MI, DL, Desc)
-        .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
-        .addReg(ScratchRsrcReg)
-        .addReg(SOffset, SOffsetRegState)
-        .addImm(Offset)
-        .addImm(0) // glc
-        .addImm(0) // slc
-        .addImm(0) // tfe
-        .addImm(0) // dlc
-        .addImm(0) // swz
-        .addMemOperand(NewMMO);
+                .addReg(SubReg,
+                        getDefRegState(!IsStore) | getKillRegState(IsKill))
+                .addReg(ScratchRsrcReg);
+      if (SOffset == AMDGPU::NoRegister) {
+        MIB.addImm(0);
+      } else {
+        MIB.addReg(SOffset, SOffsetRegState);
+      }
+      MIB.addImm(Offset)
+          .addImm(0) // glc
+          .addImm(0) // slc
+          .addImm(0) // tfe
+          .addImm(0) // dlc
+          .addImm(0) // swz
+          .addMemOperand(NewMMO);
 
       if (!IsStore && TmpReg != AMDGPU::NoRegister)
         MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
@@ -825,8 +815,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
 
   assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
-                         SuperReg != MFI->getFrameOffsetReg() &&
-                         SuperReg != MFI->getScratchWaveOffsetReg()));
+                         SuperReg != MFI->getFrameOffsetReg()));
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
 
@@ -1135,33 +1124,21 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       bool IsMUBUF = TII->isMUBUF(*MI);
 
       if (!IsMUBUF && !MFI->isEntryFunction()) {
-        // Convert to an absolute stack address by finding the offset from the
-        // scratch wave base and scaling by the wave size.
+        // Convert to a swizzled stack address by scaling by the wave size.
         //
-        // In an entry function/kernel the offset is already the absolute
-        // address relative to the frame register.
-
-        Register TmpDiffReg =
-          RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
-
-        // If there's no free SGPR, in-place modify the FP
-        Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;
+        // In an entry function/kernel the offset is already swizzled.
 
         bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
-        Register ResultReg = IsCopy ?
-          MI->getOperand(0).getReg() :
-          RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
-          .addReg(FrameReg)
-          .addReg(MFI->getScratchWaveOffsetReg());
+        Register ResultReg =
+            IsCopy ? MI->getOperand(0).getReg()
+                   : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
 
         int64_t Offset = FrameInfo.getObjectOffset(Index);
         if (Offset == 0) {
           // XXX - This never happens because of emergency scavenging slot at 0?
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
             .addImm(ST.getWavefrontSizeLog2())
-            .addReg(DiffReg);
+            .addReg(FrameReg);
         } else {
           if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
             Register ScaledReg =
@@ -1170,7 +1147,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
             BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
                     ScaledReg)
               .addImm(ST.getWavefrontSizeLog2())
-              .addReg(DiffReg, RegState::Kill);
+              .addReg(FrameReg);
 
             const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
 
@@ -1207,10 +1184,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
             // unavailable. Only one additional mov is needed.
             Register TmpScaledReg =
                 RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
-            Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg;
+            Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
 
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
-              .addReg(DiffReg, RegState::Kill)
+              .addReg(FrameReg)
               .addImm(ST.getWavefrontSizeLog2());
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
               .addReg(ScaledReg, RegState::Kill)
@@ -1224,19 +1201,12 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                 .addReg(ScaledReg, RegState::Kill)
                 .addImm(Offset);
               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
-                .addReg(DiffReg, RegState::Kill)
+                .addReg(FrameReg)
                 .addImm(ST.getWavefrontSizeLog2());
             }
           }
         }
 
-        if (!TmpDiffReg.isValid()) {
-          // Restore the FP.
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
-            .addReg(FrameReg)
-            .addReg(MFI->getScratchWaveOffsetReg());
-        }
-
         // Don't introduce an extra copy if we're just materializing in a mov.
         if (IsCopy)
           MI->eraseFromParent();
@@ -1251,10 +1221,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::vaddr));
 
-        assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
-               MFI->getStackPtrOffsetReg());
-
-        TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg);
+        auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
+        assert((SOffset.isReg() &&
+                SOffset.getReg() == MFI->getStackPtrOffsetReg()) ||
+               (SOffset.isImm() && SOffset.getImm() == 0));
+        if (SOffset.isReg()) {
+          if (FrameReg == AMDGPU::NoRegister) {
+            SOffset.ChangeToImmediate(0);
+          } else {
+            SOffset.setReg(FrameReg);
+          }
+        }
 
         int64_t Offset = FrameInfo.getObjectOffset(Index);
         int64_t OldImm

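The frame-index rewrite above drops the S_SUB_U32/S_ADD_U32 pair around the shift. As arithmetic, under the invariant the new ABI maintains (the FP is already wave-relative), the old and new lowerings agree; a sketch:

  #include <cstdint>

  constexpr unsigned WavefrontSizeLog2 = 6; // wave64; subtarget-dependent

  // Old: FP was queue-relative, so the wave offset had to be subtracted
  // before scaling down to a swizzled (per-lane) offset.
  uint32_t swizzledOld(uint32_t FP, uint32_t ScratchWaveOffset) {
    return (FP - ScratchWaveOffset) >> WavefrontSizeLog2;
  }

  // New: FP is wave-relative from the start and scales directly, matching
  // the single V_LSHRREV_B32 / S_LSHR_B32 now emitted.
  uint32_t swizzledNew(uint32_t WaveRelativeFP) {
    return WaveRelativeFP >> WavefrontSizeLog2;
  }

Both agree whenever WaveRelativeFP equals FP - ScratchWaveOffset, which is exactly what the new calling convention guarantees.
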
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 84261bef6b8e..a13f6dc4c0e7 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -50,11 +50,6 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   /// spilling is needed.
   unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
 
-  /// Return the end register initially reserved for the scratch wave offset in
-  /// case spilling is needed.
-  unsigned reservedPrivateSegmentWaveByteOffsetReg(
-    const MachineFunction &MF) const;
-
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
   const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index b39b2b4542d0..fb789d05ce9a 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -101,7 +101,6 @@ def VCC_HI : SIReg<"vcc_hi", 107>;
 def PRIVATE_RSRC_REG : SIReg<"private_rsrc", 0>;
 def FP_REG : SIReg<"fp", 0>;
 def SP_REG : SIReg<"sp", 0>;
-def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>;
 
 // Pseudo-register to represent the program-counter DWARF register.
 def PC_REG : SIReg<"pc", 0>, DwarfRegNum<[16]> {
@@ -435,7 +434,7 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
 //===----------------------------------------------------------------------===//
 
 def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
-  (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
+  (add FP_REG, SP_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
 }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 34aca1e80f59..3f18877acd94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -165,7 +165,7 @@ define void @constrained_if_register_class() {
 ; CHECK-NEXT:    s_cbranch_execz BB4_5
 ; CHECK-NEXT:  ; %bb.4: ; %bb11
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 4.0
-; CHECK-NEXT:    buffer_store_dword v0, v0, s[0:3], s33 offen
+; CHECK-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
 ; CHECK-NEXT:  BB4_5: ; %Flow
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:  BB4_6: ; %bb12

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 7b6863fb17a5..3f573e7f9a86 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -1684,8 +1684,8 @@ define amdgpu_ps <32 x i32> @dyn_insertelement_v32i32_s_s_s(<32 x i32> inreg %ve
 ; GPRIDX-NEXT:    s_mov_b32 s27, s29
 ; GPRIDX-NEXT:    s_mov_b32 s28, s30
 ; GPRIDX-NEXT:    s_mov_b32 s29, s31
-; GPRIDX-NEXT:    s_mov_b32 s30, s32
 ; GPRIDX-NEXT:    s_mov_b32 s31, s33
+; GPRIDX-NEXT:    s_mov_b32 s30, s32
 ; GPRIDX-NEXT:    s_mov_b32 m0, s35
 ; GPRIDX-NEXT:    s_nop 0
 ; GPRIDX-NEXT:    s_movreld_b32 s0, s34
@@ -1724,8 +1724,8 @@ define amdgpu_ps <32 x i32> @dyn_insertelement_v32i32_s_s_s(<32 x i32> inreg %ve
 ; MOVREL-NEXT:    s_mov_b32 s27, s29
 ; MOVREL-NEXT:    s_mov_b32 s28, s30
 ; MOVREL-NEXT:    s_mov_b32 s29, s31
-; MOVREL-NEXT:    s_mov_b32 s30, s32
 ; MOVREL-NEXT:    s_mov_b32 s31, s33
+; MOVREL-NEXT:    s_mov_b32 s30, s32
 ; MOVREL-NEXT:    s_movreld_b32 s0, s34
 ; MOVREL-NEXT:    ; implicit-def: $vcc_hi
 ; MOVREL-NEXT:    ; return to shader part epilog

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
index 08fdd0f30a16..1382434fe0a7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
@@ -16,12 +16,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_s32_from_4
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
-    ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]]
     ; GFX7-LABEL: name: load_local_s32_from_4
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -33,6 +27,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 4, addrspace 3)
     ; GFX9: $vgpr0 = COPY [[DS_READ_B32_gfx9_]]
+    ; GFX6-LABEL: name: load_local_s32_from_4
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
+    ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 3)
     $vgpr0 = COPY %1
@@ -50,12 +50,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_s32_from_2
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 2, addrspace 3)
-    ; GFX6: $vgpr0 = COPY [[DS_READ_U16_]]
     ; GFX7-LABEL: name: load_local_s32_from_2
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -67,6 +61,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 2, addrspace 3)
     ; GFX9: $vgpr0 = COPY [[DS_READ_U16_gfx9_]]
+    ; GFX6-LABEL: name: load_local_s32_from_2
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 2, addrspace 3)
+    ; GFX6: $vgpr0 = COPY [[DS_READ_U16_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 3)
     $vgpr0 = COPY %1
@@ -81,19 +81,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_s32_from_1
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-    ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
     ; GFX7-LABEL: name: load_local_s32_from_1
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -105,6 +98,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 1, addrspace 3)
     ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]]
+    ; GFX6-LABEL: name: load_local_s32_from_1
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
+    ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 3)
     $vgpr0 = COPY %1
@@ -122,12 +121,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_v2s32
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
     ; GFX7-LABEL: name: load_local_v2s32
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -139,6 +132,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, addrspace 3)
     ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]]
+    ; GFX6-LABEL: name: load_local_v2s32
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 8, addrspace 3)
     $vgpr0_vgpr1 = COPY %1
@@ -156,12 +155,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_v2s32_align4
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
     ; GFX7-LABEL: name: load_local_v2s32_align4
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -173,6 +166,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
     ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
+    ; GFX6-LABEL: name: load_local_v2s32_align4
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 4, addrspace 3)
     $vgpr0_vgpr1 = COPY %1
@@ -190,12 +189,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_s64
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
     ; GFX7-LABEL: name: load_local_s64
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -207,6 +200,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, addrspace 3)
     ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]]
+    ; GFX6-LABEL: name: load_local_s64
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 8, addrspace 3)
     $vgpr0_vgpr1 = COPY %1
@@ -224,12 +223,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_s64_align4
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ; GFX7-LABEL: name: load_local_s64_align4
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -241,6 +234,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
     ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
+    ; GFX6-LABEL: name: load_local_s64_align4
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 4, addrspace 3)
     $vgpr0_vgpr1 = COPY %1
@@ -258,12 +257,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_p3_from_4
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
-    ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]]
     ; GFX7-LABEL: name: load_local_p3_from_4
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -275,6 +268,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 4, addrspace 3)
     ; GFX9: $vgpr0 = COPY [[DS_READ_B32_gfx9_]]
+    ; GFX6-LABEL: name: load_local_p3_from_4
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
+    ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p3) = G_LOAD %0 :: (load 4, align 4, addrspace 3)
     $vgpr0 = COPY %1
@@ -292,12 +291,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_p5_from_4
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
-    ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]]
     ; GFX7-LABEL: name: load_local_p5_from_4
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -309,6 +302,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 4, addrspace 3)
     ; GFX9: $vgpr0 = COPY [[DS_READ_B32_gfx9_]]
+    ; GFX6-LABEL: name: load_local_p5_from_4
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
+    ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p3) = G_LOAD %0 :: (load 4, align 4, addrspace 3)
     $vgpr0 = COPY %1
@@ -326,12 +325,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_p1_align8
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
     ; GFX7-LABEL: name: load_local_p1_align8
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -343,6 +336,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, addrspace 3)
     ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]]
+    ; GFX6-LABEL: name: load_local_p1_align8
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 8, addrspace 3)
     $vgpr0_vgpr1 = COPY %1
@@ -360,12 +359,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_p1_align4
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     ; GFX7-LABEL: name: load_local_p1_align4
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -377,6 +370,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
     ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
+    ; GFX6-LABEL: name: load_local_p1_align4
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 4, addrspace 3)
     $vgpr0_vgpr1 = COPY %1
@@ -394,12 +393,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_p999_from_8
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
     ; GFX7-LABEL: name: load_local_p999_from_8
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
@@ -411,6 +404,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
     ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
     ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ; GFX6-LABEL: name: load_local_p999_from_8
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p999) = G_LOAD %0 :: (load 8, align 8, addrspace 3)
     $vgpr0_vgpr1 = COPY %1
@@ -428,12 +427,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_v2p3
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
     ; GFX7-LABEL: name: load_local_v2p3
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
@@ -445,6 +438,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
     ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
     ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ; GFX6-LABEL: name: load_local_v2p3
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(<2 x p3>) = G_LOAD %0 :: (load 8, align 8, addrspace 3)
     $vgpr0_vgpr1 = COPY %1
@@ -462,12 +461,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_v2s16
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
-    ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]]
     ; GFX7-LABEL: name: load_local_v2s16
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -479,6 +472,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 4, addrspace 3)
     ; GFX9: $vgpr0 = COPY [[DS_READ_B32_gfx9_]]
+    ; GFX6-LABEL: name: load_local_v2s16
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
+    ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load 4, align 4, addrspace 3)
     $vgpr0 = COPY %1
@@ -496,12 +495,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_v4s16
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
     ; GFX7-LABEL: name: load_local_v4s16
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -513,6 +506,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, addrspace 3)
     ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]]
+    ; GFX6-LABEL: name: load_local_v4s16
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load 8, align 8, addrspace 3)
     $vgpr0_vgpr1 = COPY %1
@@ -527,7 +526,6 @@ body: |
 # tracksRegLiveness: true
 # machineFunctionInfo:
 #   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-#   scratchWaveOffsetReg: $sgpr4
 #   stackPtrOffsetReg: $sgpr32
 
 # body: |
@@ -555,14 +553,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_s32_from_1_gep_65535
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
-    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-    ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
     ; GFX7-LABEL: name: load_local_s32_from_1_gep_65535
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -574,6 +564,14 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY]], 65535, 0, implicit $exec :: (load 1, addrspace 3)
     ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]]
+    ; GFX6-LABEL: name: load_local_s32_from_1_gep_65535
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
+    ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 65535
     %2:vgpr(p3) = G_PTR_ADD %0, %1
@@ -593,14 +591,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
-    ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_AND_B32_e64_]], 65535, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-    ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
     ; GFX7-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -616,6 +606,14 @@ body: |
     ; GFX9: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
     ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_AND_B32_e64_]], 65535, 0, implicit $exec :: (load 1, addrspace 3)
     ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]]
+    ; GFX6-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
+    ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_AND_B32_e64_]], 65535, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
+    ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2147483647
     %2:vgpr(s32) = G_AND %0, %1
@@ -638,14 +636,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_s32_from_1_gep_65536
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65536, implicit $exec
-    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-    ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
     ; GFX7-LABEL: name: load_local_s32_from_1_gep_65536
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -661,6 +651,14 @@ body: |
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (load 1, addrspace 3)
     ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]]
+    ; GFX6-LABEL: name: load_local_s32_from_1_gep_65536
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65536, implicit $exec
+    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
+    ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 65536
     %2:vgpr(p3) = G_PTR_ADD %0, %1
@@ -680,14 +678,6 @@ body: |
   bb.0:
     liveins:  $vgpr0
 
-    ; GFX6-LABEL: name: load_local_s32_from_1_gep_m1
-    ; GFX6: liveins: $vgpr0
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-    ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
     ; GFX7-LABEL: name: load_local_s32_from_1_gep_m1
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -703,6 +693,14 @@ body: |
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (load 1, addrspace 3)
     ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]]
+    ; GFX6-LABEL: name: load_local_s32_from_1_gep_m1
+    ; GFX6: liveins: $vgpr0
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
+    ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -1
     %2:vgpr(p3) = G_PTR_ADD %0, %1
@@ -722,14 +720,6 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
-    ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1016
-    ; GFX6: liveins: $vgpr0_vgpr1
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016
-    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1016
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -741,6 +731,14 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 254, 255, 0, implicit $exec :: (load 8, align 4, addrspace 3)
     ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
+    ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1016
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016
+    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 1016
     %2:vgpr(p3) = G_PTR_ADD %0, %1
@@ -760,14 +758,6 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
-    ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1020
-    ; GFX6: liveins: $vgpr0_vgpr1
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020
-    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1020
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -783,6 +773,14 @@ body: |
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
     ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
+    ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1020
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020
+    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 1020
     %2:vgpr(p3) = G_PTR_ADD %0, %1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
index 12e6e747a070..13e4035a4882 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
@@ -10,7 +10,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -20,12 +19,12 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_4
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_4
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -41,7 +40,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -51,12 +49,12 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_2
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_2
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 5)
@@ -72,7 +70,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -82,12 +79,12 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_1
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5)
@@ -130,7 +127,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -161,7 +157,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -196,7 +191,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -208,12 +202,12 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2047
@@ -231,7 +225,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -243,14 +236,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
     ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047_known_bits
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
     ; GFX9: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2147483647
@@ -271,7 +264,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -283,12 +275,12 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_2048
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2048
@@ -306,7 +298,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -318,14 +309,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2047
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -2047
@@ -343,7 +334,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -355,14 +345,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2048
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -2048
@@ -380,7 +370,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -392,12 +381,12 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_4095
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 4095
@@ -415,7 +404,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -427,14 +415,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_4096
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 4096
@@ -452,7 +440,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -464,14 +451,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4095
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -4095
@@ -489,7 +476,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -501,14 +487,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4096
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -4096
@@ -526,7 +512,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -538,14 +523,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_8191
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 8191
@@ -563,7 +548,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -575,14 +559,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_8192
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 8192
@@ -600,7 +584,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -612,14 +595,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8191
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -8191
@@ -637,7 +620,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -649,14 +631,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8192
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -8192
@@ -674,17 +656,16 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
 
     ; GFX6-LABEL: name: load_private_s32_from_4_constant_0
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
     ; GFX9-LABEL: name: load_private_s32_from_4_constant_0
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
     %0:vgpr(p5) = G_CONSTANT i32 0
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -700,17 +681,16 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
 
     ; GFX6-LABEL: name: load_private_s32_from_4_constant_sgpr_16
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
     ; GFX9-LABEL: name: load_private_s32_from_4_constant_sgpr_16
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
     %0:sgpr(p5) = G_CONSTANT i32 16
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -726,17 +706,16 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
 
     ; GFX6-LABEL: name: load_private_s32_from_1_constant_4095
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]]
     ; GFX9-LABEL: name: load_private_s32_from_1_constant_4095
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]]
     %0:vgpr(p5) = G_CONSTANT i32 4095
     %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5)
@@ -752,7 +731,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
@@ -760,11 +738,11 @@ body: |
 
     ; GFX6-LABEL: name: load_private_s32_from_1_constant_4096
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_constant_4096
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = G_CONSTANT i32 4096
     %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5)
@@ -780,7 +758,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 stack:
   - { id: 0, size: 4, alignment: 4 }
@@ -808,7 +785,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 stack:
   - { id: 0, size: 4096, alignment: 4 }
@@ -820,7 +796,7 @@ body: |
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4095
     ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
@@ -841,7 +817,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 stack:
   - { id: 0, size: 8192, alignment: 4 }
@@ -853,13 +828,13 @@ body: |
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4096
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_CONSTANT i32 4096

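Across the hunks above, the MUBUF checks replace the $sgpr4 SOffset operand with the inline constant 0, and scratchWaveOffsetReg disappears from each machineFunctionInfo block; the per-wave scratch offset is instead carried in the $sgpr0_sgpr1_sgpr2_sgpr3 resource descriptor itself. As a rough stand-alone sketch (illustrative C++, not code from this patch; the helper name is invented), folding an unswizzled wave offset into the 48-bit base address held in the low two dwords of a 128-bit buffer descriptor amounts to:

    #include <cstdint>

    // Illustrative only: dword 0 of a buffer resource descriptor holds
    // base[31:0], and the low 16 bits of dword 1 hold base[47:32].
    static void addWaveOffsetToScratchRsrc(uint32_t Rsrc[4],
                                           uint32_t WaveOffset) {
      uint64_t Base = (uint64_t(Rsrc[1] & 0xFFFFu) << 32) | Rsrc[0];
      Base += WaveOffset;
      Rsrc[0] = uint32_t(Base);
      Rsrc[1] = (Rsrc[1] & ~0xFFFFu) | (uint32_t(Base >> 32) & 0xFFFFu);
    }

The one register SOffset left in these checks is the frame-index case: when %stack.0 is used directly as the vaddr operand (the fi_offset_4095 GFX9 line), the SOffset is $sgpr32, while the fi_offset_4096 case first materializes the address with a V_ADD and then uses the inline 0 like the rest.
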
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
index 60cc05c7da5c..440c34f10163 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
@@ -13,19 +13,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-    ; GFX6-LABEL: name: store_local_s32_to_4
-    ; GFX6: liveins: $vgpr0, $vgpr1
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_s32_to_4
     ; GFX7: liveins: $vgpr0, $vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -37,6 +30,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9: DS_WRITE_B32_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 4, addrspace 3)
+    ; GFX6-LABEL: name: store_local_s32_to_4
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p3) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 3)
@@ -51,19 +50,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-    ; GFX6-LABEL: name: store_local_s32_to_2
-    ; GFX6: liveins: $vgpr0, $vgpr1
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: DS_WRITE_B16 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 2, addrspace 3)
     ; GFX7-LABEL: name: store_local_s32_to_2
     ; GFX7: liveins: $vgpr0, $vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -75,6 +67,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9: DS_WRITE_B16_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 2, addrspace 3)
+    ; GFX6-LABEL: name: store_local_s32_to_2
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: DS_WRITE_B16 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 2, addrspace 3)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p3) = COPY $vgpr1
     G_STORE %0, %1 :: (store 2, align 2, addrspace 3)
@@ -89,19 +87,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-    ; GFX6-LABEL: name: store_local_s32_to_1
-    ; GFX6: liveins: $vgpr0, $vgpr1
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: DS_WRITE_B8 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3)
     ; GFX7-LABEL: name: store_local_s32_to_1
     ; GFX7: liveins: $vgpr0, $vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -113,6 +104,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9: DS_WRITE_B8_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 1, addrspace 3)
+    ; GFX6-LABEL: name: store_local_s32_to_1
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: DS_WRITE_B8 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p3) = COPY $vgpr1
     G_STORE %0, %1 :: (store 1, align 1, addrspace 3)
@@ -127,19 +124,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-    ; GFX6-LABEL: name: store_local_v2s16
-    ; GFX6: liveins: $vgpr0, $vgpr1
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_v2s16
     ; GFX7: liveins: $vgpr0, $vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -151,6 +141,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9: DS_WRITE_B32_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 4, addrspace 3)
+    ; GFX6-LABEL: name: store_local_v2s16
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(p3) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 3)
@@ -165,19 +161,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-    ; GFX6-LABEL: name: store_local_p3
-    ; GFX6: liveins: $vgpr0, $vgpr1
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_p3
     ; GFX7: liveins: $vgpr0, $vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -189,6 +178,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9: DS_WRITE_B32_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 4, addrspace 3)
+    ; GFX6-LABEL: name: store_local_p3
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p3) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 3)
@@ -205,11 +200,6 @@ tracksRegLiveness: true
 body: |
   bb.0:
 
-    ; GFX6-LABEL: name: store_local_s32_to_1_constant_4095
-    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3)
     ; GFX7-LABEL: name: store_local_s32_to_1_constant_4095
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -219,6 +209,11 @@ body: |
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9: DS_WRITE_B8_gfx9 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (store 1, addrspace 3)
+    ; GFX6-LABEL: name: store_local_s32_to_1_constant_4095
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3)
     %0:vgpr(p3) = G_CONSTANT i32 4095
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store 1, align 1, addrspace 3)
@@ -233,7 +228,6 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 stack:
   - { id: 0, size: 4096, alignment: 4 }
@@ -241,11 +235,6 @@ stack:
 body: |
   bb.0:
 
-    ; GFX6-LABEL: name: store_local_s32_to_1_constant_4096
-    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3)
     ; GFX7-LABEL: name: store_local_s32_to_1_constant_4096
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -255,6 +244,11 @@ body: |
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9: DS_WRITE_B8_gfx9 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (store 1, addrspace 3)
+    ; GFX6-LABEL: name: store_local_s32_to_1_constant_4096
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3)
     %0:vgpr(p3) = G_CONSTANT i32 4096
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store 1, align 1, addrspace 3)
@@ -269,19 +263,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2
 
-    ; GFX6-LABEL: name: store_local_s64_align4
-    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: G_STORE [[COPY]](s64), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_s64_align4
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -297,6 +284,12 @@ body: |
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3)
+    ; GFX6-LABEL: name: store_local_s64_align4
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: G_STORE [[COPY]](s64), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %0, %1 :: (store 8, align 4, addrspace 3)
@@ -311,19 +304,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2
 
-    ; GFX6-LABEL: name: store_local_p1_align4
-    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_p1_align4
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -339,6 +325,12 @@ body: |
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3)
+    ; GFX6-LABEL: name: store_local_p1_align4
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %0, %1 :: (store 8, align 4, addrspace 3)
@@ -353,19 +345,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2
 
-    ; GFX6-LABEL: name: store_local_v2s32_align4
-    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_v2s32_align4
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -381,6 +366,12 @@ body: |
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3)
+    ; GFX6-LABEL: name: store_local_v2s32_align4
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
     %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %0, %1 :: (store 8, align 4, addrspace 3)
@@ -395,19 +386,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2
 
-    ; GFX6-LABEL: name: store_local_v4s16_align4
-    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_v4s16_align4
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -423,6 +407,12 @@ body: |
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3)
+    ; GFX6-LABEL: name: store_local_v4s16_align4
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
     %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %0, %1 :: (store 8, align 4, addrspace 3)
@@ -437,19 +427,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2
 
-    ; GFX6-LABEL: name: store_local_s64_align8
-    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3)
     ; GFX7-LABEL: name: store_local_s64_align8
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -461,6 +444,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, addrspace 3)
+    ; GFX6-LABEL: name: store_local_s64_align8
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3)
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %0, %1 :: (store 8, align 8, addrspace 3)
@@ -475,19 +464,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2
 
-    ; GFX6-LABEL: name: store_local_p1_align8
-    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3)
     ; GFX7-LABEL: name: store_local_p1_align8
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -499,6 +481,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, addrspace 3)
+    ; GFX6-LABEL: name: store_local_p1_align8
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %0, %1 :: (store 8, align 8, addrspace 3)
@@ -513,19 +501,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2
 
-    ; GFX6-LABEL: name: store_local_v2s32_align8
-    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3)
     ; GFX7-LABEL: name: store_local_v2s32_align8
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -537,6 +518,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, addrspace 3)
+    ; GFX6-LABEL: name: store_local_v2s32_align8
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3)
     %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %0, %1 :: (store 8, align 8, addrspace 3)
@@ -551,19 +538,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2
 
-    ; GFX6-LABEL: name: store_local_v4s16_align8
-    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3)
     ; GFX7-LABEL: name: store_local_v4s16_align8
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -575,6 +555,12 @@ body: |
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, addrspace 3)
+    ; GFX6-LABEL: name: store_local_v4s16_align8
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3)
     %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %0, %1 :: (store 8, align 8, addrspace 3)
@@ -589,21 +575,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2
 
-    ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1016
-    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016
-    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32)
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_s64_align4_from_1_gep_1016
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -619,6 +596,14 @@ body: |
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 254, 255, 0, implicit $exec :: (store 8, align 4, addrspace 3)
+    ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1016
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016
+    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32)
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3)
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     %2:vgpr(s32) = G_CONSTANT i32 1016
@@ -635,21 +620,12 @@ regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2
 
-    ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1020
-    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
-    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020
-    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32)
-    ; GFX6: $m0 = S_MOV_B32 -1
-    ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_s64_align4_from_1_gep_1020
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -669,6 +645,14 @@ body: |
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX9: DS_WRITE2_B32_gfx9 [[V_ADD_U32_e64_]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3)
+    ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1020
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020
+    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32)
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3)
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     %2:vgpr(s32) = G_CONSTANT i32 1020

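The DS_WRITE2 offsets in the two gep tests above also show why only the 1020 case needs an extra V_ADD: DS_WRITE2_B32 encodes two 8-bit offsets in 4-byte units, so a 64-bit store at byte offset 1016 fits as the pair (254, 255), while 1020 would need 256 for the second half. A small self-contained check of that arithmetic (illustrative C++, not code from this patch):

    #include <cassert>
    #include <cstdint>

    // DS_WRITE2_B32 splits an 8-byte store into two dword slots at
    // ByteOffset/4 and ByteOffset/4 + 1; each slot index must fit in 8 bits.
    static bool fitsDsWrite2(uint32_t ByteOffset) {
      return ByteOffset % 4 == 0 && ByteOffset / 4 + 1 <= 255;
    }

    int main() {
      assert(fitsDsWrite2(1016));   // encoded as offsets 254 and 255
      assert(!fitsDsWrite2(1020));  // second slot would be 256: overflows
    }
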
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
index e5bc41051119..9ac43878862c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
@@ -4,29 +4,29 @@
 
 ---
 
-name: store_private_s32_to_4
+name: function_store_private_s32_to_4
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
+  isEntryFunction: false
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-    ; GFX6-LABEL: name: store_private_s32_to_4
+    ; GFX6-LABEL: name: function_store_private_s32_to_4
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
-    ; GFX9-LABEL: name: store_private_s32_to_4
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_4
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -35,29 +35,29 @@ body: |
 
 ---
 
-name: store_private_s32_to_2
+name: function_store_private_s32_to_2
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
+  isEntryFunction: false
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-    ; GFX6-LABEL: name: store_private_s32_to_2
+    ; GFX6-LABEL: name: function_store_private_s32_to_2
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
-    ; GFX9-LABEL: name: store_private_s32_to_2
+    ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_2
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
+    ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 2, align 2, addrspace 5)
@@ -66,29 +66,29 @@ body: |
 
 ---
 
-name: store_private_s32_to_1
+name: function_store_private_s32_to_1
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
+  isEntryFunction: false
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-    ; GFX6-LABEL: name: store_private_s32_to_1
+    ; GFX6-LABEL: name: function_store_private_s32_to_1
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
-    ; GFX9-LABEL: name: store_private_s32_to_1
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_1
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 1, align 1, addrspace 5)
@@ -97,29 +97,29 @@ body: |
 
 ---
 
-name: store_private_v2s16
+name: function_store_private_v2s16
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
+  isEntryFunction: false
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-    ; GFX6-LABEL: name: store_private_v2s16
+    ; GFX6-LABEL: name: function_store_private_v2s16
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
-    ; GFX9-LABEL: name: store_private_v2s16
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_v2s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -128,29 +128,29 @@ body: |
 
 ---
 
-name: store_private_p3
+name: function_store_private_p3
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
+  isEntryFunction: false
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-    ; GFX6-LABEL: name: store_private_p3
+    ; GFX6-LABEL: name: function_store_private_p3
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
-    ; GFX9-LABEL: name: store_private_p3
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_p3
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -159,29 +159,29 @@ body: |
 
 ---
 
-name: store_private_p5
+name: function_store_private_p5
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
+  isEntryFunction: false
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-    ; GFX6-LABEL: name: store_private_p5
+    ; GFX6-LABEL: name: function_store_private_p5
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
-    ; GFX9-LABEL: name: store_private_p5
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_p5
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -190,13 +190,13 @@ body: |
 
 ---
 
-name: store_private_s32_to_1_fi_offset_4095
+name: function_store_private_s32_to_1_fi_offset_4095
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
+  isEntryFunction: false
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 stack:
   - { id: 0, size: 4096, alignment: 4 }
@@ -204,13 +204,13 @@ stack:
 body: |
   bb.0:
 
-    ; GFX6-LABEL: name: store_private_s32_to_1_fi_offset_4095
+    ; GFX6-LABEL: name: function_store_private_s32_to_1_fi_offset_4095
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
     ; GFX6: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
-    ; GFX9-LABEL: name: store_private_s32_to_1_fi_offset_4095
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_1_fi_offset_4095
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
@@ -223,13 +223,13 @@ body: |
 
 ---
 
-name: store_private_s32_to_1_constant_4095
+name: function_store_private_s32_to_1_constant_4095
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
+  isEntryFunction: false
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 stack:
   - { id: 0, size: 4096, alignment: 4 }
@@ -237,12 +237,12 @@ stack:
 body: |
   bb.0:
 
-    ; GFX6-LABEL: name: store_private_s32_to_1_constant_4095
+    ; GFX6-LABEL: name: function_store_private_s32_to_1_constant_4095
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
-    ; GFX9-LABEL: name: store_private_s32_to_1_constant_4095
+    ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4095
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(p5) = G_CONSTANT i32 4095
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store 1, align 1, addrspace 5)
@@ -251,13 +251,13 @@ body: |
 
 ---
 
-name: store_private_s32_to_1_constant_4096
+name: function_store_private_s32_to_1_constant_4096
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 machineFunctionInfo:
+  isEntryFunction: false
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 stack:
   - { id: 0, size: 4096, alignment: 4 }
@@ -265,14 +265,291 @@ stack:
 body: |
   bb.0:
 
-    ; GFX6-LABEL: name: store_private_s32_to_1_constant_4096
+    ; GFX6-LABEL: name: function_store_private_s32_to_1_constant_4096
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
-    ; GFX9-LABEL: name: store_private_s32_to_1_constant_4096
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4096
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    %0:vgpr(p5) = G_CONSTANT i32 4096
+    %1:vgpr(s32) = G_CONSTANT i32 0
+    G_STORE %1, %0 :: (store 1, align 1, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_s32_to_4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_s32_to_4
+    ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_s32_to_4
+    ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = COPY $vgpr1
+    G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_s32_to_2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_s32_to_2
+    ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_s32_to_2
+    ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = COPY $vgpr1
+    G_STORE %0, %1 :: (store 2, align 2, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_s32_to_1
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_s32_to_1
+    ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_s32_to_1
+    ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = COPY $vgpr1
+    G_STORE %0, %1 :: (store 1, align 1, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_v2s16
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_v2s16
+    ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_v2s16
+    ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    %0:vgpr(<2 x s16>) = COPY $vgpr0
+    %1:vgpr(p5) = COPY $vgpr1
+    G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_p3
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_p3
+    ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_p3
+    ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    %0:vgpr(p3) = COPY $vgpr0
+    %1:vgpr(p5) = COPY $vgpr1
+    G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_p5
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_p5
+    ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_p5
+    ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    %0:vgpr(p5) = COPY $vgpr0
+    %1:vgpr(p5) = COPY $vgpr1
+    G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_s32_to_1_fi_offset_4095
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+stack:
+  - { id: 0, size: 4096, alignment: 4 }
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095
+    ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095
+    ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    %0:vgpr(p5) = G_FRAME_INDEX %stack.0
+    %1:vgpr(s32) = G_CONSTANT i32 4095
+    %2:vgpr(p5) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_CONSTANT i32 0
+    G_STORE %3, %2 :: (store 1, align 1, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_s32_to_1_constant_4095
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+stack:
+  - { id: 0, size: 4096, alignment: 4 }
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_s32_to_1_constant_4095
+    ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4095
+    ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    %0:vgpr(p5) = G_CONSTANT i32 4095
+    %1:vgpr(s32) = G_CONSTANT i32 0
+    G_STORE %1, %0 :: (store 1, align 1, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_s32_to_1_constant_4096
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+stack:
+  - { id: 0, size: 4096, alignment: 4 }
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_s32_to_1_constant_4096
+    ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4096
+    ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(p5) = G_CONSTANT i32 4096
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store 1, align 1, addrspace 5)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index f34c481824af..ed240f2240bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -1459,9 +1459,9 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s21, s21, s31
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX9-NEXT:    s_and_b32 s24, s24, 1
-; GFX9-NEXT:    s_mul_hi_u32 s32, s0, s12
+; GFX9-NEXT:    s_mul_hi_u32 s33, s0, s12
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
-; GFX9-NEXT:    s_add_u32 s21, s21, s32
+; GFX9-NEXT:    s_add_u32 s21, s21, s33
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX9-NEXT:    s_and_b32 s24, s24, 1
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
@@ -1508,26 +1508,26 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s22, s22, s31
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
 ; GFX9-NEXT:    s_and_b32 s25, s25, 1
-; GFX9-NEXT:    s_mul_hi_u32 s32, s3, s10
-; GFX9-NEXT:    s_add_i32 s24, s24, s25
-; GFX9-NEXT:    s_add_u32 s22, s22, s32
-; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX9-NEXT:    s_and_b32 s25, s25, 1
-; GFX9-NEXT:    s_mul_hi_u32 s33, s2, s11
+; GFX9-NEXT:    s_mul_hi_u32 s33, s3, s10
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s33
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
 ; GFX9-NEXT:    s_and_b32 s25, s25, 1
-; GFX9-NEXT:    s_mul_hi_u32 s34, s1, s12
+; GFX9-NEXT:    s_mul_hi_u32 s34, s2, s11
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s34
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
 ; GFX9-NEXT:    s_and_b32 s25, s25, 1
-; GFX9-NEXT:    s_mul_hi_u32 s35, s0, s13
+; GFX9-NEXT:    s_mul_hi_u32 s35, s1, s12
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s35
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
 ; GFX9-NEXT:    s_and_b32 s25, s25, 1
+; GFX9-NEXT:    s_mul_hi_u32 s36, s0, s13
+; GFX9-NEXT:    s_add_i32 s24, s24, s25
+; GFX9-NEXT:    s_add_u32 s22, s22, s36
+; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX9-NEXT:    s_and_b32 s25, s25, 1
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s23
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0

diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index ea40cda4fa6b..5e39e6700401 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -133,7 +133,7 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
 ; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
 ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
-; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
 define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
   %ftos = addrspacecast i32* %ptr to i32 addrspace(5)*
   store volatile i32 0, i32 addrspace(5)* %ftos
@@ -231,7 +231,7 @@ define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
 
 ; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
 ; HSA: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
-; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0
 define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
   %cast = addrspacecast i32* null to i32 addrspace(5)*
   store volatile i32 7, i32 addrspace(5)* %cast

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index d26f51302a9c..072a76780447 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -45,8 +45,8 @@
 ; HSA-ALLOCA: s_add_u32 s6, s6, s9
 ; HSA-ALLOCA: s_lshr_b32 flat_scratch_hi, s6, 8
 
-; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
-; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
+; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0
+; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0
 
 
 ; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
@@ -226,10 +226,10 @@ for.end:
 
 ; R600-VECT: MOVA_INT
 
-; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:6 ; encoding: [0x06,0x00,0x68,0xe0
-; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x68,0xe0
+; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:6 ; encoding: [0x06,0x00,0x68,0xe0
+; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x68,0xe0
 ; Loaded value is 0 or 1, so sext will become zext, so we get buffer_load_ushort instead of buffer_load_sshort.
-; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0
 
 ; SI-PROMOTE-VECT: s_load_dword [[IDX:s[0-9]+]]
 ; SI-PROMOTE-VECT: s_mov_b32 [[SREG:s[0-9]+]], 0x10000
@@ -257,8 +257,8 @@ entry:
 ; SI-PROMOTE-VECT-DAG: s_lshl_b32
 ; SI-PROMOTE-VECT-DAG: v_lshrrev
 
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x60,0xe0
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:5 ; encoding: [0x05,0x00,0x60,0xe0
 define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i8], addrspace(5)
@@ -281,7 +281,7 @@ entry:
 ; R600-NOT: [[CHAN]]+
 ;
 ; A total of 5 bytes should be allocated and used.
-; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ;
+; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ;
 define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1, addrspace(5)
@@ -393,9 +393,9 @@ entry:
 
 ; FUNC-LABEL: ptrtoint:
 ; SI-NOT: ds_write
-; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 ; SI: v_add_{{[iu]}}32_e32 [[ADD_OFFSET:v[0-9]+]], vcc, 5,
-; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
+; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offen ;
 define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32], addrspace(5)
   %tmp0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a

diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index 78d99d881cb1..3d75eca93cb4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE
 
 ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
-; TRAP-HANDLER-ENABLE:  NumSgprs: 60
-; TRAP-HANDLER-DISABLE: NumSgprs: 78
+; TRAP-HANDLER-ENABLE:  NumSgprs: 61
+; TRAP-HANDLER-DISABLE: NumSgprs: 79
 define amdgpu_kernel void @amdhsa_trap_num_sgprs(
     i32 addrspace(1)* %out0, i32 %in0,
     i32 addrspace(1)* %out1, i32 %in1,

diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index b301384b7159..f5505c97ebd5 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -13,9 +13,9 @@ declare void @llvm.amdgcn.s.barrier() #2
 ; SI-LABEL: {{^}}test_private_array_ptr_calc:
 
 ; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16, v{{[0-9]+}}
-; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64
+; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
 ; SI-ALLOCA: s_barrier
-; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64
+; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
 ;
 ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
 ; alloca to a vector.  It currently fails because it does not know how

diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index 9dc74cb83625..5ee5626cdedf 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -2,11 +2,11 @@
 
 ; FIXME: Vectorization can increase required SGPR count beyond limit.
 
-; ALL-LABEL: {{^}}max_9_sgprs:
+; ALL-LABEL: {{^}}max_10_sgprs:
 
 ; ALL: SGPRBlocks: 1
-; ALL: NumSGPRsForWavesPerEU: 9
-define amdgpu_kernel void @max_9_sgprs() #0 {
+; ALL: NumSGPRsForWavesPerEU: 10
+define amdgpu_kernel void @max_10_sgprs() #0 {
   %one = load volatile i32, i32 addrspace(4)* undef
   %two = load volatile i32, i32 addrspace(4)* undef
   %three = load volatile i32, i32 addrspace(4)* undef
@@ -17,7 +17,8 @@ define amdgpu_kernel void @max_9_sgprs() #0 {
   %eight = load volatile i32, i32 addrspace(4)* undef
   %nine = load volatile i32, i32 addrspace(4)* undef
   %ten = load volatile i32, i32 addrspace(4)* undef
-  call void asm sideeffect "", "s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine)
+  %eleven = load volatile i32, i32 addrspace(4)* undef
+  call void asm sideeffect "", "s,s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine, i32 %ten)
   store volatile i32 %one, i32 addrspace(1)* undef
   store volatile i32 %two, i32 addrspace(1)* undef
   store volatile i32 %three, i32 addrspace(1)* undef
@@ -28,6 +29,7 @@ define amdgpu_kernel void @max_9_sgprs() #0 {
   store volatile i32 %eight, i32 addrspace(1)* undef
   store volatile i32 %nine, i32 addrspace(1)* undef
   store volatile i32 %ten, i32 addrspace(1)* undef
+  store volatile i32 %eleven, i32 addrspace(1)* undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
index e123d80fb956..91eb7d0add48 100644
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -2,31 +2,6 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
 
 %struct.ByValStruct = type { [4 x i32] }
-
-; GCN-LABEL: {{^}}void_func_byval_struct:
-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
-; GCN-NOT: s32
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
-; GCN-NOT: s32
-
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-; GCN-NOT: s32
-; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
-; GCN-NOT: s32
-define hidden void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 {
-entry:
-  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
-  %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4
-  %add = add nsw i32 %tmp, 1
-  store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
-  %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4
-  %add3 = add nsw i32 %tmp1, 2
-  store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4
-  store volatile i32 9, i32 addrspace(1)* null, align 4
-  ret void
-}
-
 ; Make sure the offset is folded and function's frame register is used
 ; rather than the global scratch wave offset.
 ; GCN-LABEL: {{^}}void_func_byval_struct_use_outside_entry_block:
@@ -67,331 +42,6 @@ bb0:
 bb1:
   ret void
 }
-
-; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
-; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:36
-; GCN-DAG: v_writelane_b32 v33, s34,
-; GCN: s_mov_b32 s34, s32
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}}
-; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
-; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:32
-; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
-
-; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
-; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s34{{$}}
-
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:16{{$}}
-; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
-
-; GCN: s_swappc_b64
-
-; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s34 offset:16{{$}}
-
-; GCN: v_readlane_b32
-; GCN-NOT: v_readlane_b32 s32
-; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34 offset:32
-; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
-; GCN: v_readlane_b32 s34, v33,
-; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GCN: s_setpc_b64
-define void  @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 {
-entry:
-  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
-  %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4
-  %add = add nsw i32 %tmp, 1
-  store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
-  %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4
-  %add3 = add nsw i32 %tmp1, 2
-  call void @external_void_func_void()
-  store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4
-  store volatile i32 9, i32 addrspace(1)* null, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}call_void_func_byval_struct_func:
-; GCN: s_mov_b32 s34, s32
-; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
-; GCN-DAG: v_writelane_b32
-
-; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
-
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s34{{$}}
-; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s34 offset:16
-
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}}
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:4
-; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s34 offset:8
-; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s34 offset:12
-
-; GCN-NOT: s_add_u32 s32, s32, 0x800
-
-
-; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
-
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
-; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20
-; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24
-; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28
-
-; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
-; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
-
-; GCN: s_swappc_b64
-; GCN-NOT: v_readlane_b32 s32
-; GCN: v_readlane_b32
-; GCN-NOT: v_readlane_b32 s32
-
-; GCN-NOT: s_sub_u32 s32, s32, 0x800
-
-; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
-; GCN: v_readlane_b32 s34, v
-; GCN: s_waitcnt
-; GCN: s_setpc_b64
-define void @call_void_func_byval_struct_func() #1 {
-entry:
-  %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
-  %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
-  %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
-  %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
-  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
-  store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
-  store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4
-  call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1)
-  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
-  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
-  ret void
-}
-
-; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel:
-; GCN: s_mov_b32 s33, s7
-; GCN-NOT: s_add_u32 s32, s32, 0x800
-
-; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8
-; GCN: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
-; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24
-
-; GCN-NOT: s_add_u32 s32, s32, 0x800
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12
-; GCN-DAG: s_add_u32 s32, s33, 0xc00{{$}}
-; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16
-; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20
-
-; GCN: s_getpc_b64
-
-; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
-
-; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24
-; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28
-; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32
-; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36
-
-; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
-; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
-
-
-; GCN: s_swappc_b64
-; GCN-NOT: s_sub_u32 s32
-; GCN: s_endpgm
-define amdgpu_kernel void @call_void_func_byval_struct_kernel() #1 {
-entry:
-  %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
-  %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
-  %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
-  %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
-  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
-  store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
-  store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4
-  call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1)
-  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
-  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
-  ret void
-}
-
-; GCN-LABEL: {{^}}void_func_byval_struct_align8:
-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
-; GCN-NOT: s32
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
-; GCN-NOT: s32
-
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-; GCN-NOT: s32
-; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
-; GCN-NOT: s32
-define hidden void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 {
-entry:
-  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
-  %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 8
-  %add = add nsw i32 %tmp, 1
-  store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 8
-  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
-  %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 8
-  %add3 = add nsw i32 %tmp1, 2
-  store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 8
-  store volatile i32 9, i32 addrspace(1)* null, align 4
-  ret void
-}
-
-; Make sure the byval alignment is respected in the call frame setup
-; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_kernel:
-; GCN: s_mov_b32 s33, s7
-; GCN-NOT: s_add_u32 s32, s32, 0x800
-
-; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8
-; GCN: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
-; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24
-
-
-; GCN-NOT: s_add_u32 s32, s32, 0x800
-
-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12
-; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16
-; GCN: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20
-
-; GCN-NOT: s_add_u32 s32, s32, 0x800
-; GCN-DAG: s_add_u32 s32, s33, 0xc00{{$}}
-
-; GCN: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
-; GCN: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
-; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
-
-
-; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24
-; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28
-; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32
-; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36
-
-; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
-; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
-
-
-; GCN: s_swappc_b64
-; GCN-NOT: s_sub_u32 s32
-; GCN: s_endpgm
-define amdgpu_kernel void @call_void_func_byval_struct_align8_kernel() #1 {
-entry:
-  %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5)
-  %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5)
-  %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
-  %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
-  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
-  store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8
-  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
-  store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8
-  call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1)
-  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
-  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
-  ret void
-}
-
-; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_func:
-; GCN: s_mov_b32 s34, s32
-; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
-; GCN-DAG: v_writelane_b32
-
-; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
-
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s34{{$}}
-; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s34 offset:16
-
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}}
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:4
-; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s34 offset:8
-; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s34 offset:12
-
-; GCN-NOT: s_add_u32 s32, s32, 0x800
-
-; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
-
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
-; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20
-; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24
-; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28
-
-; GCN: s_waitcnt vmcnt(0)
-; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
-; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
-
-; GCN: s_swappc_b64
-; GCN-NOT: v_readlane_b32 s32
-; GCN: v_readlane_b32
-; GCN-NOT: v_readlane_b32 s32
-
-; GCN-NOT: s_sub_u32 s32, s32, 0x800
-
-; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
-; GCN: v_readlane_b32 s34, v
-; GCN: s_waitcnt
-; GCN-NEXT: s_setpc_b64
-define void @call_void_func_byval_struct_align8_func() #0 {
-entry:
-  %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5)
-  %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5)
-  %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
-  %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
-  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
-  store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8
-  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
-  store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8
-  call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1)
-  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
-  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
-  ret void
-}
-
-; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim:
-define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 {
-entry:
-  %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
-  %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
-  %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
-  %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
-  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
-  store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
-  store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4
-  call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1)
-  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
-  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
-  ret void
-}
-
 declare hidden void @external_void_func_void() #0
 
 declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #3

diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index faf15d0981c7..30461a55b136 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -78,14 +78,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 }
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext:
-; MESA: s_mov_b32 s33, s3{{$}}
-; HSA: s_mov_b32 s33, s9{{$}}
 
 ; HSA: buffer_load_ubyte [[VAR:v[0-9]+]]
-; HSA: s_mov_b32 s32, s33
+; HSA: s_mov_b32 s32, 0
 ; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]]
-; MESA-DAG: s_mov_b32 s32, s33{{$}}
-
+; MESA-DAG: s_mov_b32 s32, 0{{$}}
 
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4
@@ -103,13 +100,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 
 ; FIXME: load should be scheduled before getpc
 ; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext:
-; MESA: s_mov_b32 s33, s3{{$}}
 
 ; HSA: buffer_load_ubyte v0
-; HSA-DAG: s_mov_b32 s32, s33{{$}}
+; HSA-DAG: s_mov_b32 s32, 0{{$}}
 
 ; MESA: buffer_load_ubyte v0
-; MESA-DAG: s_mov_b32 s32, s33{{$}}
+; MESA-DAG: s_mov_b32 s32, 0{{$}}
 
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
@@ -127,14 +123,13 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 }
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm:
-; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4
 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4
 ; GCN-DAG: v_mov_b32_e32 v0, 0x7b
 
-; GCN-DAG: s_mov_b32 s32, s33{{$}}
+; GCN-DAG: s_mov_b32 s32, 0{{$}}
 
 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
@@ -145,15 +140,13 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 
 ; FIXME: don't wait before call
 ; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext:
-; HSA-DAG: s_mov_b32 s33, s9{{$}}
-; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
 ; GCN-DAG: buffer_load_sbyte v0
 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4
 
-; GCN-DAG: s_mov_b32 s32, s3
+; GCN-DAG: s_mov_b32 s32, 0
 
 ; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
@@ -165,15 +158,13 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 }
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext:
-; MESA-DAG: s_mov_b32 s33, s3{{$}}
-; HSA-DAG: s_mov_b32 s33, s9{{$}}
 
 ; GCN-DAG: buffer_load_ubyte v0
 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4
 
-; GCN-DAG: s_mov_b32 s32, s33
+; GCN-DAG: s_mov_b32 s32, 0
 
 ; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
@@ -187,7 +178,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GCN-LABEL: {{^}}test_call_external_void_func_i16_imm:
 ; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}}
 
-; GCN-DAG: s_mov_b32 s32, s33
+; GCN-DAG: s_mov_b32 s32, 0
 
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
@@ -196,14 +187,13 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 }
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext:
-; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
 ; GCN-DAG: buffer_load_sshort v0
 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4
 
-; GCN-DAG: s_mov_b32 s32, s33
+; GCN-DAG: s_mov_b32 s32, 0
 
 ; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
@@ -215,14 +205,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 }
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_i16_zeroext:
-; MESA-DAG: s_mov_b32 s33, s3{{$}}
-
 
 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4
 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4
 
-; GCN-DAG: s_mov_b32 s32, s33
+; GCN-DAG: s_mov_b32 s32, 0
 
 ; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
@@ -234,13 +222,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
 }
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm:
-; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4
 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4
 ; GCN-DAG: v_mov_b32_e32 v0, 42
-; GCN-DAG: s_mov_b32 s32, s33
+; GCN-DAG: s_mov_b32 s32, 0
 
 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
@@ -497,9 +484,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:
-; HSA-DAG: s_mov_b32 s33, s9
-; MESA-DAG: s_mov_b32 s33, s3{{$}}
+; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: {{.*}}
 
 ; GCN-NOT: v3
 ; GCN-DAG: v_mov_b32_e32 v0, 3
@@ -616,10 +601,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
 }
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_v32i32_i32:
-; HSA-DAG: s_mov_b32 s33, s9
 ; HSA-NOT: s_add_u32 s32
 
-; MESA-DAG: s_mov_b32 s33, s3{{$}}
 ; MESA-NOT: s_add_u32 s32
 
 ; GCN-DAG: buffer_load_dword [[VAL1:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
@@ -670,19 +653,19 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32:
 ; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
 ; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
-; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], s33 offset:8
-; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], s33 offset:12
+; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], 0 offset:8
+; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], 0 offset:12
 
-; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], s33 offset:8
-; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], s33 offset:12
+; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], 0 offset:8
+; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], 0 offset:12
 
-; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8
-; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12
+; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], 0 offset:8
+; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], 0 offset:12
 
-; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8
-; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12
+; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], 0 offset:8
+; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], 0 offset:12
 
-; GCN-DAG: s_add_u32 [[SP:s[0-9]+]], s33, 0x400{{$}}
+; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x400{{$}}
 
 ; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}}
 ; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4
@@ -703,23 +686,22 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 }
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
-; MESA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x800{{$}}
-; HSA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x800{{$}}
+; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x800{{$}}
 
 ; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
 ; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
-; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8
-; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
+; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12
 
-; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8
-; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
+; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12
 
 ; GCN-NOT: s_add_u32 [[SP]]
 ; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}}
 ; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
 ; GCN: s_swappc_b64
-; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16
-; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20
+; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
+; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:20
 ; GCN-NOT: s_sub_u32 [[SP]]
 
 ; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off

diff --git a/llvm/test/CodeGen/AMDGPU/call-constant.ll b/llvm/test/CodeGen/AMDGPU/call-constant.ll
index 19aadfc96ad1..11f4b3c0b913 100644
--- a/llvm/test/CodeGen/AMDGPU/call-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-constant.ll
@@ -3,9 +3,8 @@
 ; FIXME: Emitting unnecessary flat_scratch setup
 
 ; GCN-LABEL: {{^}}test_call_undef:
-; GCN: s_mov_b32 s8, s7
 ; GCN: s_mov_b32 flat_scratch_lo, s5
-; GCN: s_add_u32 s4, s4, s8
+; GCN: s_add_u32 s4, s4, s7
 ; GCN: s_lshr_b32
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_call_undef() #0 {
@@ -24,9 +23,8 @@ define i32 @test_tail_call_undef() #0 {
 }
 
 ; GCN-LABEL: {{^}}test_call_null:
-; GCN: s_mov_b32 s8, s7
 ; GCN: s_mov_b32 flat_scratch_lo, s5
-; GCN: s_add_u32 s4, s4, s8
+; GCN: s_add_u32 s4, s4, s7
 ; GCN: s_lshr_b32
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_call_null() #0 {

diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index ee77007ef59e..0f754385d294 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -5,11 +5,10 @@
 declare hidden void @external_void_func_void() #0
 
 ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
-; GCN: s_mov_b32 s33, s7
 ; GCN: s_getpc_b64 s[34:35]
 ; GCN-NEXT: s_add_u32 s34, s34,
 ; GCN-NEXT: s_addc_u32 s35, s35,
-; GCN-NEXT: s_mov_b32 s32, s33
+; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64 s[30:31], s[34:35]
 
 ; GCN-NEXT: #ASMSTART
@@ -105,9 +104,9 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(i32 addrspace(1)
 }
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31:
-; GCN: s_mov_b32 s34, s31
+; GCN: s_mov_b32 s33, s31
 ; GCN-NEXT: s_swappc_b64
-; GCN-NEXT: s_mov_b32 s31, s34
+; GCN-NEXT: s_mov_b32 s31, s33
 define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 {
   %s31 = call i32 asm sideeffect "; def $0", "={s31}"()
   call void @external_void_func_void()
@@ -126,14 +125,11 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace
   ret void
 }
 
-; FIXME: What is the expected behavior for reserved registers here?
-
 ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
-; GCN: s_mov_b32 s33, s9
-; GCN: s_mov_b32 s32, s33
 ; GCN: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4
+; GCN: s_mov_b32 s32, 0
 ; GCN: #ASMSTART
 ; GCN-NEXT: ; def s33
 ; GCN-NEXT: #ASMEND
@@ -150,14 +146,15 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34:
-; GCN: s_mov_b32 s33, s9
-; GCN-NOT: s34
+; FIXME: What is the expected behavior for reserved registers here?
+
+; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34:
 ; GCN-NOT: s34
 
 ; GCN: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4
+; GCN: s_mov_b32 s32, 0
 
 ; GCN-NOT: s34
 ; GCN: ;;#ASMSTART
@@ -180,15 +177,14 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32:
-; GCN: s_mov_b32 s33, s9
+; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32:
 
 ; GCN-NOT: v32
 ; GCN: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4
+; GCN: s_mov_b32 s32, 0
 ; GCN-NOT: v32
-; GCN-DAG: s_mov_b32 s32, s33
 
 ; GCN: ;;#ASMSTART
 ; GCN-NEXT: ; def v32
@@ -234,12 +230,10 @@ define hidden void @void_func_void_clobber_s34() #2 {
 }
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33:
-; GCN: s_mov_b32 s33, s7
-
 ; GCN: s_getpc_b64
 ; GCN-NEXT: s_add_u32
 ; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_mov_b32 s32, s33
+; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 {
@@ -248,11 +242,10 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 {
 }
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34:
-; GCN: s_mov_b32 s33, s7
 ; GCN: s_getpc_b64
 ; GCN-NEXT: s_add_u32
 ; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_mov_b32 s32, s33
+; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 {

diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 72423ec4189e..6c9a9af159bd 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -6,16 +6,17 @@ define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0
 ; GCN-LABEL: call_memory_arg_load:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    s_mov_b32 s33, s9
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s33
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    ds_read_b32 v0, v0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, func@rel32@hi+4
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    s_endpgm
   %vgpr = load volatile i32, i32 addrspace(3)* %ptr
@@ -28,19 +29,20 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
 ; GCN-LABEL: call_memory_no_dep:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_mov_b32 s33, s9
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s33
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    global_store_dword v[0:1], v2, off
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_getpc_b64 s[6:7]
 ; GCN-NEXT:    s_add_u32 s6, s6, func@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s7, s7, func@rel32@hi+4
-; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT:    s_endpgm
   store i32 0, i32 addrspace(1)* %ptr
@@ -52,15 +54,16 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
 define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 {
 ; GCN-LABEL: call_no_wait_after_call:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
-; GCN-NEXT:    s_mov_b32 s33, s9
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s33
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, func@rel32@hi+4
-; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    v_mov_b32_e32 v32, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s34
@@ -75,15 +78,16 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32)
 define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 {
 ; GCN-LABEL: call_no_wait_after_call_return_val:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
-; GCN-NEXT:    s_mov_b32 s33, s9
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s33
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func.return@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, func.return@rel32@hi+4
-; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN-NEXT:    v_mov_b32_e32 v2, s35
@@ -98,15 +102,16 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)*
 define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 {
 ; GCN-LABEL: call_got_load:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s33, s9
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s33
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, got.func@gotpcrel32@hi+4
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 497ea354fc09..023bc6e21276 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -243,11 +243,10 @@ define hidden void @use_every_sgpr_input() #1 {
 }
 
 ; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input:
-; GCN: s_mov_b32 s33, s17
 ; GCN: s_mov_b32 s12, s14
 ; GCN: s_mov_b32 s13, s15
 ; GCN: s_mov_b32 s14, s16
-; GCN: s_mov_b32 s32, s33
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 
 ; GCN: .amdhsa_user_sgpr_private_segment_buffer 1

diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
index 601ed9698c61..16916e4aa42e 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -203,7 +203,7 @@ define hidden void @use_workgroup_id_yz() #1 {
 ; GCN-NEXT: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+4
-; GCN: s_mov_b32 s32, s33
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
@@ -216,9 +216,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
 ; GCN: enable_sgpr_workgroup_id_y = 1
 ; GCN: enable_sgpr_workgroup_id_z = 0
 
-; GCN: s_mov_b32 s33, s8
-; GCN-DAG: s_mov_b32 s4, s7
-; GCN: s_mov_b32 s32, s33
+; GCN: s_mov_b32 s4, s7
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
   call void @use_workgroup_id_y()
@@ -229,9 +228,10 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
 ; GCN: enable_sgpr_workgroup_id_x = 1
 ; GCN: enable_sgpr_workgroup_id_y = 0
 ; GCN: enable_sgpr_workgroup_id_z = 1
-; GCN: s_mov_b32 s33, s8
+
 ; GCN: s_mov_b32 s4, s7
 
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
   call void @use_workgroup_id_z()
@@ -243,11 +243,10 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
 ; GCN: enable_sgpr_workgroup_id_y = 1
 ; GCN: enable_sgpr_workgroup_id_z = 0
 
-; GCN: s_mov_b32 s33, s8
-
 ; GCN: s_mov_b32 s5, s7
 ; GCN: s_mov_b32 s4, s6
-; GCN: s_mov_b32 s32, s33
+
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
   call void @use_workgroup_id_xy()
@@ -259,13 +258,11 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
 ; GCN: enable_sgpr_workgroup_id_y = 1
 ; GCN: enable_sgpr_workgroup_id_z = 1
 
-; GCN: s_mov_b32 s33, s9
-
 ; GCN: s_mov_b32 s4, s6
 ; GCN: s_mov_b32 s5, s7
 ; GCN: s_mov_b32 s6, s8
 
-; GCN: s_mov_b32 s32, s33
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 {
   call void @use_workgroup_id_xyz()
@@ -277,12 +274,10 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 {
 ; GCN: enable_sgpr_workgroup_id_y = 0
 ; GCN: enable_sgpr_workgroup_id_z = 1
 
-; GCN: s_mov_b32 s33, s8
 ; GCN: s_mov_b32 s5, s7
 ; GCN: s_mov_b32 s4, s6
 
-; GCN: s_mov_b32 s32, s33
-
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
   call void @use_workgroup_id_xz()
@@ -294,10 +289,10 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
 ; GCN: enable_sgpr_workgroup_id_y = 1
 ; GCN: enable_sgpr_workgroup_id_z = 1
 
-; GCN: s_mov_b32 s33, s9
 ; GCN: s_mov_b32 s4, s7
 ; GCN: s_mov_b32 s5, s8
-; GCN: s_mov_b32 s32, s33
+
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 {
   call void @use_workgroup_id_yz()
@@ -364,10 +359,10 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
 ; GCN: enable_sgpr_workgroup_id_y = 0
 ; GCN: enable_sgpr_workgroup_id_z = 0
 
-; GCN-DAG: s_mov_b32 s33, s7
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
 ; GCN-DAG: s_mov_b32 s4, s6
-; GCN-DAG: s_mov_b32 s32, s33
+
+; GCN-DAG: s_mov_b32 s32, 0
 ; GCN-NOT: s4
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
@@ -380,11 +375,10 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
 ; GCN: enable_sgpr_workgroup_id_y = 1
 ; GCN: enable_sgpr_workgroup_id_z = 0
 
-; GCN-DAG: s_mov_b32 s33, s8
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
 ; GCN-DAG: s_mov_b32 s4, s7
 
-; GCN-DAG: s_mov_b32 s32, s33
+; GCN-DAG: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 {
   call void @other_arg_use_workgroup_id_y(i32 555)
@@ -396,10 +390,9 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 {
 ; GCN: enable_sgpr_workgroup_id_y = 0
 ; GCN: enable_sgpr_workgroup_id_z = 1
 
-; GCN-DAG: s_mov_b32 s33, s8
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
 
-; GCN: s_mov_b32 s32, s33
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 {
   call void @other_arg_use_workgroup_id_z(i32 555)
@@ -465,11 +458,10 @@ define hidden void @use_every_sgpr_input() #1 {
 ; GCN: enable_sgpr_dispatch_id = 1
 ; GCN: enable_sgpr_flat_scratch_init = 1
 
-; GCN: s_mov_b32 s33, s17
 ; GCN: s_mov_b32 s12, s14
 ; GCN: s_mov_b32 s13, s15
 ; GCN: s_mov_b32 s14, s16
-; GCN: s_mov_b32 s32, s33
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 {
   call void @use_every_sgpr_input()

diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 421d41294a28..0dee34dcfdc8 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -457,15 +457,13 @@ define void @too_many_args_use_workitem_id_x(
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x:
 ; VARABI: enable_vgpr_workitem_id = 0
 
-; VARABI: s_mov_b32 s33, s7
-; VARABI: s_mov_b32 s32, s33
+; VARABI: s_mov_b32 s32, 0
 ; VARABI: buffer_store_dword v0, off, s[0:3], s32{{$}}
 ; VARABI: s_swappc_b64
 
 
 ; FIXEDABI: enable_vgpr_workitem_id = 2
-; FIXEDABI: s_mov_b32 s33, s17
-; FIXEDABI-DAG: s_mov_b32 s32, s33
+; FIXEDABI-DAG: s_mov_b32 s32, 0
 ; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
 ; FIXEDABI-DAG:	v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
 ; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
@@ -616,11 +614,10 @@ define void @too_many_args_use_workitem_id_x_byval(
 
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
 ; VARABI: enable_vgpr_workitem_id = 0
-; VARABI-DAG: s_mov_b32 s33, s7
-; VARABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; VARABI: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
-; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
-; VARABI: s_add_u32 s32, s33, 0x400{{$}}
+; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
+; VARABI: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
+; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
+; VARABI: s_movk_i32 s32, 0x400{{$}}
 
 ; VARABI-NOT: s32
 ; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
@@ -630,16 +627,16 @@ define void @too_many_args_use_workitem_id_x_byval(
 ; VARABI: s_swappc_b64
 
 
-; FIXEDABI: s_mov_b32 s33, s17
-; FIXEDABI-DAG: s_add_u32 s32, s33, 0x400
-; FIXEDABI-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33 offset:4{{$}}
+; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
+
+; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
 
 ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
 
 ; FIXME: Why this reload?
-; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], s33 offset:4{{$}}
+; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}}
 
 ; FIXEDABI-DAG:	v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
 ; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
@@ -789,9 +786,7 @@ define void @too_many_args_use_workitem_id_xyz(
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
 ; GCN: enable_vgpr_workitem_id = 2
 
-; VARABI-DAG: s_mov_b32 s33, s7
-; FIXEDABI-DAG: s_mov_b32 s33, s17
-; GCN-DAG: s_mov_b32 s32, s33
+; GCN-DAG: s_mov_b32 s32, 0
 
 ; GCN-DAG:	v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
 ; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
@@ -885,16 +880,13 @@ define void @too_many_args_use_workitem_id_x_stack_yz(
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
 ; GCN: enable_vgpr_workitem_id = 2
 
-; VARABI: s_mov_b32 s33, s7
-; FIXEDABI: s_mov_b32 s33, s17
-
 ; GCN-NOT: v0
 ; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-DAG: v_or_b32_e32 v0, v0, v1
 ; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2
 ; GCN-DAG: v_or_b32_e32 v31, v0, v2
 
-; GCN: s_mov_b32 s32, s33
+; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
   call void @too_many_args_use_workitem_id_x_stack_yz(

diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index a091811fc7c1..0878205a53fd 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -28,8 +28,8 @@ define amdgpu_kernel void @stored_fi_to_lds(float addrspace(5)* addrspace(3)* %p
 ; Offset is applied
 ; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
 
 ; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
 
@@ -51,9 +51,9 @@ define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float addrspace(5)*
 ; Same frame index is used multiple times in the store
 ; GCN-LABEL: {{^}}stored_fi_to_self:
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
-; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 define amdgpu_kernel void @stored_fi_to_self() #0 {
   %tmp = alloca i32 addrspace(5)*, addrspace(5)
 
@@ -66,13 +66,13 @@ define amdgpu_kernel void @stored_fi_to_self() #0 {
 
 ; GCN-LABEL: {{^}}stored_fi_to_self_offset:
 ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}}
-; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 
 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}
+; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}}
 
 ; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}}
-; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}
+; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}}
 define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
   %tmp0 = alloca [512 x i32], addrspace(5)
   %tmp1 = alloca i32 addrspace(5)*, addrspace(5)
@@ -89,15 +89,15 @@ define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
 }
 
 ; GCN-LABEL: {{^}}stored_fi_to_fi:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}}
 
 ; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
-; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}}
+; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}}
 
 ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
-; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
+; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
 define amdgpu_kernel void @stored_fi_to_fi() #0 {
   %tmp0 = alloca i32 addrspace(5)*, addrspace(5)
   %tmp1 = alloca i32 addrspace(5)*, addrspace(5)
@@ -115,7 +115,7 @@ define amdgpu_kernel void @stored_fi_to_fi() #0 {
 }
 
 ; GCN-LABEL: {{^}}stored_fi_to_global:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
 define amdgpu_kernel void @stored_fi_to_global(float addrspace(5)* addrspace(1)* %ptr) #0 {
@@ -127,9 +127,9 @@ define amdgpu_kernel void @stored_fi_to_global(float addrspace(5)* addrspace(1)*
 
 ; Offset is applied
 ; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}}
 
 ; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
 ; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
@@ -150,7 +150,7 @@ define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float addrspace(5
 
 ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset:
 ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 
 ; FIXME: Re-initialize
 ; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 4{{$}}
@@ -160,7 +160,7 @@ define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float addrspace(5
 
 
 ; GCN: v_add_i32_e32 [[BASE_1_OFF_2:v[0-9]+]], vcc, 56, [[BASE_0_1]]
-; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
 
 ; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32 addrspace(5)* addrspace(1)* %ptr) #0 {

diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
new file mode 100644
index 000000000000..2cd8d67b355f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -0,0 +1,422 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
+
+define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
+; GFX803-LABEL: test_kern_empty:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_kern_empty:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_kern_empty:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
+; GFX803-LABEL: test_kern_stack:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_add_u32 s4, s4, s7
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT:    s_add_u32 s0, s0, s7
+; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    v_mov_b32_e32 v0, 0
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_kern_stack:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT:    s_add_u32 s0, s0, s7
+; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_kern_stack:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_add_u32 s4, s4, s7
+; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT:    s_add_u32 s0, s0, s7
+; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-NEXT:    ; implicit-def: $vcc_hi
+; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX1010-NEXT:    s_endpgm
+entry:
+  %x = alloca i32, align 4, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %x, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
+; GFX803-LABEL: test_kern_call:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_add_u32 s4, s4, s7
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT:    s_add_u32 s0, s0, s7
+; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT:    s_getpc_b64 s[4:5]
+; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX803-NEXT:    s_mov_b32 s32, 0
+; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_kern_call:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT:    s_add_u32 s0, s0, s7
+; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    s_getpc_b64 s[4:5]
+; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_kern_call:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_add_u32 s4, s4, s7
+; GFX1010-NEXT:    s_mov_b32 s32, 0
+; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT:    s_add_u32 s0, s0, s7
+; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
+; GFX1010-NEXT:    s_getpc_b64 s[4:5]
+; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX1010-NEXT:    ; implicit-def: $vcc_hi
+; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX1010-NEXT:    s_endpgm
+entry:
+  tail call void @ex() #0
+  ret void
+}
+
+define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
+; GFX803-LABEL: test_kern_stack_and_call:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_add_u32 s4, s4, s7
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT:    s_add_u32 s0, s0, s7
+; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    v_mov_b32_e32 v0, 0
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT:    s_getpc_b64 s[4:5]
+; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX803-NEXT:    s_movk_i32 s32, 0x400
+; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_kern_stack_and_call:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT:    s_add_u32 s0, s0, s7
+; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_getpc_b64 s[4:5]
+; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX900-NEXT:    s_movk_i32 s32, 0x400
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_kern_stack_and_call:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_add_u32 s4, s4, s7
+; GFX1010-NEXT:    s_movk_i32 s32, 0x200
+; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT:    s_add_u32 s0, s0, s7
+; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-NEXT:    s_getpc_b64 s[4:5]
+; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX1010-NEXT:    ; implicit-def: $vcc_hi
+; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX1010-NEXT:    s_endpgm
+entry:
+  %x = alloca i32, align 4, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %x, align 4
+  tail call void @ex() #0
+  ret void
+}
+
+define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
+; GFX803-LABEL: test_force_fp_kern_empty:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_mov_b32 s34, 0
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_force_fp_kern_empty:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_mov_b32 s34, 0
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_force_fp_kern_empty:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_mov_b32 s34, 0
+; GFX1010-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
+; GFX803-LABEL: test_force_fp_kern_stack:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_add_u32 s4, s4, s7
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT:    s_add_u32 s0, s0, s7
+; GFX803-NEXT:    s_mov_b32 s34, 0
+; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    v_mov_b32_e32 v0, 0
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s34 offset:4
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_force_fp_kern_stack:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT:    s_add_u32 s0, s0, s7
+; GFX900-NEXT:    s_mov_b32 s34, 0
+; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s34 offset:4
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_force_fp_kern_stack:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_add_u32 s4, s4, s7
+; GFX1010-NEXT:    s_mov_b32 s34, 0
+; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT:    s_add_u32 s0, s0, s7
+; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-NEXT:    ; implicit-def: $vcc_hi
+; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s34 offset:4
+; GFX1010-NEXT:    s_endpgm
+entry:
+  %x = alloca i32, align 4, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %x, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
+; GFX803-LABEL: test_force_fp_kern_call:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_add_u32 s4, s4, s7
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT:    s_add_u32 s0, s0, s7
+; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT:    s_getpc_b64 s[4:5]
+; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX803-NEXT:    s_mov_b32 s32, 0
+; GFX803-NEXT:    s_mov_b32 s34, 0
+; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_force_fp_kern_call:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT:    s_add_u32 s0, s0, s7
+; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    s_getpc_b64 s[4:5]
+; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_mov_b32 s34, 0
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_force_fp_kern_call:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_add_u32 s4, s4, s7
+; GFX1010-NEXT:    s_mov_b32 s32, 0
+; GFX1010-NEXT:    s_mov_b32 s34, 0
+; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT:    s_add_u32 s0, s0, s7
+; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
+; GFX1010-NEXT:    s_getpc_b64 s[4:5]
+; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX1010-NEXT:    ; implicit-def: $vcc_hi
+; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX1010-NEXT:    s_endpgm
+entry:
+  tail call void @ex() #2
+  ret void
+}
+
+define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
+; GFX803-LABEL: test_force_fp_kern_stack_and_call:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_add_u32 s4, s4, s7
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT:    s_add_u32 s0, s0, s7
+; GFX803-NEXT:    s_mov_b32 s34, 0
+; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    v_mov_b32_e32 v0, 0
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT:    s_getpc_b64 s[4:5]
+; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX803-NEXT:    s_movk_i32 s32, 0x400
+; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s34 offset:4
+; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_force_fp_kern_stack_and_call:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT:    s_add_u32 s0, s0, s7
+; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    s_mov_b32 s34, 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_getpc_b64 s[4:5]
+; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX900-NEXT:    s_movk_i32 s32, 0x400
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s34 offset:4
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_add_u32 s4, s4, s7
+; GFX1010-NEXT:    s_movk_i32 s32, 0x200
+; GFX1010-NEXT:    s_mov_b32 s34, 0
+; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT:    s_add_u32 s0, s0, s7
+; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-NEXT:    s_getpc_b64 s[4:5]
+; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
+; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
+; GFX1010-NEXT:    ; implicit-def: $vcc_hi
+; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s34 offset:4
+; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX1010-NEXT:    s_endpgm
+entry:
+  %x = alloca i32, align 4, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %x, align 4
+  tail call void @ex() #2
+  ret void
+}
+
+define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
+; GFX803-LABEL: test_sgpr_offset_kernel:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_add_u32 s4, s4, s7
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; GFX803-NEXT:    s_add_u32 s0, s0, s7
+; GFX803-NEXT:    s_addc_u32 s1, s1, 0
+; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
+; GFX803-NEXT:    s_mov_b32 s4, 0x40000
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
+; GFX803-NEXT:    ;;#ASMSTART
+; GFX803-NEXT:    ;;#ASMEND
+; GFX803-NEXT:    s_mov_b32 s4, 0x40000
+; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
+; GFX803-NEXT:    s_endpgm
+;
+; GFX900-LABEL: test_sgpr_offset_kernel:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
+; GFX900-NEXT:    s_add_u32 s0, s0, s7
+; GFX900-NEXT:    s_addc_u32 s1, s1, 0
+; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
+; GFX900-NEXT:    s_mov_b32 s6, 0x40000
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s6, 0x40000
+; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
+; GFX900-NEXT:    s_endpgm
+;
+; GFX1010-LABEL: test_sgpr_offset_kernel:
+; GFX1010:       ; %bb.0: ; %entry
+; GFX1010-NEXT:    s_add_u32 s4, s4, s7
+; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT:    s_add_u32 s0, s0, s7
+; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
+; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
+; GFX1010-NEXT:    ; implicit-def: $vcc_hi
+; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
+; GFX1010-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX1010-NEXT:    v_nop
+; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
+; GFX1010-NEXT:    ;;#ASMSTART
+; GFX1010-NEXT:    ;;#ASMEND
+; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX1010-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
+; GFX1010-NEXT:    s_endpgm
+entry:
+  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
+  ; fit in the instruction, and has to live in the SGPR offset.
+  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
+  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
+
+  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
+  ; 0x40000 / 64 = 4096 (for wave64)
+  ; CHECK: s_add_u32 s6, s7, 0x40000
+  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
+  %a = load volatile i32, i32 addrspace(5)* %aptr
+
+  ; Force %a to spill
+  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
+  store volatile i32 %a, i32 addrspace(5)* %outptr
+
+  ret void
+}
+
+declare hidden void @ex() local_unnamed_addr #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
+attributes #2 = { nounwind "frame-pointer"="all" }

diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index 1a3cc72fe5b2..a2fb893af540 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -135,8 +135,8 @@ done:
 
 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
 ; GCN: s_and_saveexec_b64
-; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
-; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
+; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
+; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
 ; GCN: {{^}}BB4_2:
 define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
 entry:
@@ -174,9 +174,9 @@ done:
 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
 ; GCN: s_and_saveexec_b64
 ; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
-; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
+; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
 ; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
-; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
+; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
 ; GCN: {{^BB[0-9]+}}_2:
 
 define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
@@ -213,8 +213,8 @@ done:
 
 ; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
 ; GCN: s_and_saveexec_b64
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
 ; GCN: {{^BB[0-9]+}}_2:
 define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
 entry:

diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 0df32537808a..a85cdcc01922 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -5,9 +5,9 @@ define <2 x half> @chain_hi_to_lo_private() {
 ; GCN-LABEL: chain_hi_to_lo_private:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v0, off, s[0:3], s33 offset:2
+; GCN-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s33
+; GCN-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -26,9 +26,9 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %ba
 ; GCN-LABEL: chain_hi_to_lo_private_different_bases:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_load_ushort v0, v0, s[0:3], s33 offen
+; GCN-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_short_d16_hi v0, v1, s[0:3], s33 offen
+; GCN-NEXT:    buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -46,7 +46,7 @@ define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in)
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v1, 1.0, v1
-; GCN-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen
+; GCN-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -196,6 +196,8 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
 ; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5
@@ -203,20 +205,20 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
 ; GCN-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v4, off, s[0:3], s9 offset:4
+; GCN-NEXT:    buffer_store_short v4, off, s[0:3], 0 offset:4
 ; GCN-NEXT:    global_load_ushort v4, v[2:3], off offset:2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v4, off, s[0:3], s9 offset:6
+; GCN-NEXT:    buffer_store_short v4, off, s[0:3], 0 offset:6
 ; GCN-NEXT:    global_load_ushort v2, v[2:3], off offset:4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v2, off, s[0:3], s9 offset:8
-; GCN-NEXT:    buffer_load_ushort v2, off, s[0:3], s9 offset:4
-; GCN-NEXT:    buffer_load_ushort v4, off, s[0:3], s9 offset:6
+; GCN-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:8
+; GCN-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:4
+; GCN-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:6
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v3, v4
-; GCN-NEXT:    buffer_load_short_d16_hi v3, off, s[0:3], s9 offset:8
+; GCN-NEXT:    buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8
 ; GCN-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
@@ -298,10 +300,10 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
 ; GCN-LABEL: chain_hi_to_lo_private_other_dep:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen
+; GCN-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
-; GCN-NEXT:    buffer_load_short_d16 v1, v0, s[0:3], s33 offen offset:2
+; GCN-NEXT:    buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 1af2ca55308b..b86db5a7ac68 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -225,7 +225,7 @@ bb.end:                                           ; preds = %bb.then, %bb
 ; GCN: s_andn2_b64 exec, exec,
 ; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]
 
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offen
 
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER]], {{vcc|s\[[0-9:]+\]}}
 ; GCN-NEXT: s_cbranch_execz [[BB1_OUTER_LOOP]]

diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index f144ed263ff3..2c1074ae62fc 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -22,16 +22,16 @@
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]]
 
 ; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
 ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
 
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:20 ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
@@ -40,13 +40,13 @@
 ; GCN: ; %bb.{{[0-9]+}}: ; %if
 ; GCN: s_mov_b32 m0, -1
 ; GCN: ds_read_b32 [[LOAD1:v[0-9]+]]
-; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
+; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
 
 
 ; Spill val register
 ; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]]
-; GCN: buffer_store_dword [[VAL]], off, s[0:3], s7 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[VAL]], off, s[0:3], 0 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; VMEM: [[ENDIF]]:
 
@@ -56,18 +56,18 @@
 
 
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:20 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
 
 ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}
 
 ; Restore val
-; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
+; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
 define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
@@ -102,7 +102,7 @@ endif:
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
 
 ; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
@@ -110,9 +110,9 @@ endif:
 
 
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:28 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:28 ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
@@ -120,10 +120,10 @@ endif:
 
 
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
-; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
 ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
 ; GCN: s_cmp_lg_u32
-; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; GCN-NEXT: s_cbranch_scc1 [[LOOP]]
 
 
@@ -131,16 +131,16 @@ endif:
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:28 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:28 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
 
 ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}
-; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
 define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 {
@@ -179,16 +179,16 @@ end:
 ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
 ; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
 ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
 
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, [[CMP0]]
 
@@ -201,18 +201,18 @@ end:
 ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
 
 
-; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET]]
+; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET]]
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_HI]]
 
 ; GCN: s_or_saveexec_b64 s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
 
 ; Regular spill value restored after exec modification
-; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
+; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
 
 
 ; Spill saved exec
@@ -221,26 +221,26 @@ end:
 
 
 ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_LO:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
-; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
 ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]]
 
 
 ; GCN: ; %bb.{{[0-9]+}}: ; %if
 ; GCN: ds_read_b32
-; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
-; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
 ; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
 
 ; GCN: [[ELSE]]: ; %else
-; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
 ; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
-; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; GCN-NEXT: s_branch [[FLOW]]
 
 ; GCN: [[ENDIF]]:
@@ -248,17 +248,17 @@ end:
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]
 
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
 
 ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}
 
-; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
 define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
 entry:
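
The checks above capture the user-visible effect of this change for kernel
spill code: the scratch wave offset no longer rides along in a live SGPR
(s7 in the old checks), because it has already been folded into the scratch
descriptor in s[0:3]. The before/after spill pattern, with illustrative
operands taken from these tests:

  ; before: SOffset carries the scratch wave offset in an SGPR
  buffer_store_dword v0, off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
  ; after: the offset is baked into the SRSrc base, so an inline
  ; constant 0 SOffset suffices
  buffer_store_dword v0, off, s[0:3], 0 offset:4  ; 4-byte Folded Spill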

diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 019beea0ab17..0fc880251b2c 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -169,14 +169,15 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
 ; GCN-LABEL: v3i16_registers:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    s_mov_b32 s33, s9
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s33
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s4, 1, s4
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 1
 ; GCN-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_cbranch_vccz BB4_2
 ; GCN-NEXT:  ; %bb.1:
 ; GCN-NEXT:    s_mov_b32 s4, 0
@@ -213,14 +214,15 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
 ; GCN-LABEL: v3f16_registers:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT:    s_mov_b32 s33, s9
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s33
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s4, 1, s4
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 1
 ; GCN-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_cbranch_vccz BB5_2
 ; GCN-NEXT:  ; %bb.1:
 ; GCN-NEXT:    s_mov_b32 s4, 0
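
These two hunks show the reworked kernel prologue itself. The old code
staged the wave offset in s33 (s_mov_b32 s33, s9) and copied it into the
stack pointer; the new code adds it once into the scratch bases and starts
the stack pointer at zero. A condensed sketch of the new entry sequence,
using the register assignments from these tests (s9 holds the incoming
scratch wave offset):

  s_add_u32  flat_scratch_lo, s6, s9 ; fold wave offset into flat scratch
  s_addc_u32 flat_scratch_hi, s7, 0
  s_add_u32  s0, s0, s9              ; fold wave offset into SRSrc base (lo)
  s_addc_u32 s1, s1, 0               ; propagate the carry into the hi word
  s_mov_b32  s32, 0                  ; SP starts at zero, wave-relative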

diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll
index f119af24038d..be8c3b5a44b5 100644
--- a/llvm/test/CodeGen/AMDGPU/extload-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}load_i8_sext_private:
-; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
 define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i8, addrspace(5)
@@ -13,7 +13,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_i8_zext_private:
-; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
 define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i8, addrspace(5)
@@ -24,7 +24,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_i16_sext_private:
-; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
 define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i16, addrspace(5)
@@ -35,7 +35,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_i16_zext_private:
-; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
 define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i16, addrspace(5)

diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index 6cdd03b75491..85f9ea173eb5 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -9,8 +9,8 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
-; GFX7-ALIGNED-NEXT:    buffer_load_ushort v0, v0, s[0:3], s33 offen
-; GFX7-ALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], s33 offen
+; GFX7-ALIGNED-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -20,8 +20,8 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
-; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v0, v0, s[0:3], s33 offen
-; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], s33 offen
+; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
+; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-UNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -30,8 +30,8 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
 ; GFX9-LABEL: private_load_2xi16_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v1, v0, s[0:3], s33 offen
-; GFX9-NEXT:    buffer_load_ushort v0, v0, s[0:3], s33 offen offset:2
+; GFX9-NEXT:    buffer_load_ushort v1, v0, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -53,8 +53,8 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)*
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
-; GFX7-ALIGNED-NEXT:    buffer_store_short v3, v1, s[0:3], s33 offen
-; GFX7-ALIGNED-NEXT:    buffer_store_short v0, v2, s[0:3], s33 offen
+; GFX7-ALIGNED-NEXT:    buffer_store_short v3, v1, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -64,8 +64,8 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)*
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX7-UNALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
-; GFX7-UNALIGNED-NEXT:    buffer_store_short v3, v1, s[0:3], s33 offen
-; GFX7-UNALIGNED-NEXT:    buffer_store_short v0, v2, s[0:3], s33 offen
+; GFX7-UNALIGNED-NEXT:    buffer_store_short v3, v1, s[0:3], 0 offen
+; GFX7-UNALIGNED-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -73,9 +73,9 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)*
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
-; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], s33 offen
-; GFX9-NEXT:    v_mov_b32_e32 v0, 2
-; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], s33 offen offset:2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 2
+; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_store_short v2, v1, s[0:3], 0 offen offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
@@ -89,36 +89,35 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
 ; GFX7-ALIGNED-LABEL: private_load_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v2, v2, s[0:3], s33 offen
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
-; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v3, vcc, 3, v0
-; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v3, v3, s[0:3], s33 offen
-; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v1, v1, s[0:3], s33 offen
-; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v0, v0, s[0:3], s33 offen
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v3, v0, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
+; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v0, v0, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align1:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-UNALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], s33 offen
+; GFX7-UNALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: private_load_2xi16_align1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], s33 offen
+; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -141,15 +140,15 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)*
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 1
-; GFX7-ALIGNED-NEXT:    buffer_store_byte v3, v1, s[0:3], s33 offen
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
-; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v5, 0
+; GFX7-ALIGNED-NEXT:    buffer_store_byte v3, v1, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT:    buffer_store_byte v5, v4, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, 2
-; GFX7-ALIGNED-NEXT:    buffer_store_byte v4, v3, s[0:3], s33 offen
-; GFX7-ALIGNED-NEXT:    buffer_store_byte v4, v1, s[0:3], s33 offen
-; GFX7-ALIGNED-NEXT:    buffer_store_byte v0, v2, s[0:3], s33 offen
+; GFX7-ALIGNED-NEXT:    buffer_store_byte v5, v1, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT:    buffer_store_byte v0, v2, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -157,7 +156,7 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)*
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x20001
-; GFX7-UNALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], s33 offen
+; GFX7-UNALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -165,7 +164,7 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)*
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
-; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], s33 offen
+; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
@@ -186,21 +185,21 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
 ; GFX7-ALIGNED-LABEL: private_load_2xi16_align4:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-ALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], s33 offen
+; GFX7-ALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align4:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-UNALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], s33 offen
+; GFX7-UNALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: private_load_2xi16_align4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], s33 offen
+; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -233,7 +232,7 @@ define void @private_store_2xi16_align4(i16 addrspace(5)* %p, i16 addrspace(5)*
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x20001
-; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], s33 offen
+; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
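
For non-entry functions like these, an incoming private (addrspace(5))
pointer is already wave-relative, so the s33 SOffset the old checks
expected becomes an inline constant 0; s[0:3] alone carries the wave's
scratch base. A representative callee-side access after this change
(operands illustrative):

  ; v0 holds a wave-relative private pointer passed in by the caller
  buffer_load_dword v0, v0, s[0:3], 0 offen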

diff --git a/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir b/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
index f80176508bef..cf4fd49c3d60 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
@@ -1,8 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination  %s -o - | FileCheck -check-prefix=GCN %s
 
+# Kernels can have no FP
 ---
-name: no_fold_fi_non_stack_rsrc_soffset
+name: kernel_no_fold_fi_non_stack_rsrc_and_soffset
 tracksRegLiveness: true
 frameInfo:
   maxAlignment:    4
@@ -12,14 +13,12 @@ stack:
 machineFunctionInfo:
   isEntryFunction: true
   scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
-  scratchWaveOffsetReg: '$sgpr6'
-  frameOffsetReg:  '$sgpr6'
-  stackPtrOffsetReg: '$sgpr6'
+  stackPtrOffsetReg: '$sgpr32'
 body:             |
   bb.0:
     liveins: $sgpr12_sgpr13_sgpr14_sgpr15
 
-    ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc_soffset
+    ; GCN-LABEL: name: kernel_no_fold_fi_non_stack_rsrc_and_soffset
     ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -36,7 +35,7 @@ body:             |
 ...
 
 ---
-name: no_fold_fi_non_stack_rsrc
+name: kernel_no_fold_fi_non_stack_rsrc
 tracksRegLiveness: true
 frameInfo:
   maxAlignment:    4
@@ -46,14 +45,12 @@ stack:
 machineFunctionInfo:
   isEntryFunction: true
   scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
-  scratchWaveOffsetReg: '$sgpr6'
-  frameOffsetReg:  '$sgpr6'
   stackPtrOffsetReg: '$sgpr32'
 body:             |
   bb.0:
     liveins: $sgpr12_sgpr13_sgpr14_sgpr15
 
-    ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc
+    ; GCN-LABEL: name: kernel_no_fold_fi_non_stack_rsrc
     ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -68,9 +65,8 @@ body:             |
 
 ...
 
-# Offset is from global scratch wave offset.
 ---
-name: fold_fi_mubuf_scratch_scratch_wave_offset
+name: kernel_no_fold_fi_non_stack_soffset
 tracksRegLiveness: true
 frameInfo:
   maxAlignment:    4
@@ -80,12 +76,44 @@ stack:
 machineFunctionInfo:
   isEntryFunction: true
   scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  scratchWaveOffsetReg: '$sgpr33'
   stackPtrOffsetReg: '$sgpr32'
 body:             |
   bb.0:
 
-    ; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset
+    ; GCN-LABEL: name: kernel_no_fold_fi_non_stack_soffset
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ; GCN: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+    %2:sreg_32_xm0 = S_MOV_B32 0
+
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %3
+    S_ENDPGM 0, implicit $vgpr0
+
+...
+
+---
+name: kernel_fold_fi_mubuf
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    4
+  localFrameSize:  4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: kernel_fold_fi_mubuf
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
     ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
@@ -94,15 +122,17 @@ body:             |
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
 
-    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec
-    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = COPY %2
     S_ENDPGM 0, implicit $vgpr0
 
 ...
 
+
+# Functions have an unswizzled SP/FP relative to the wave offset
 ---
-name: no_fold_fi_mubuf_scratch_sp_offset
+name: function_no_fold_fi_non_stack_rsrc_and_soffset
 tracksRegLiveness: true
 frameInfo:
   maxAlignment:    4
@@ -110,14 +140,143 @@ frameInfo:
 stack:
   - { id: 0, size: 4, alignment: 4, local-offset: 0 }
 machineFunctionInfo:
-  isEntryFunction: true
+  isEntryFunction: false
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  frameOffsetReg:  '$sgpr32'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+
+    ; GCN-LABEL: name: function_no_fold_fi_non_stack_rsrc_and_soffset
+    ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
+    ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+    %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    %1:sreg_32_xm0 = S_MOV_B32 0
+    %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %3
+    SI_RETURN_TO_EPILOG $vgpr0
+
+...
+
+---
+name: function_no_fold_fi_non_stack_rsrc
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    4
+  localFrameSize:  4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  frameOffsetReg:  '$sgpr32'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+
+    ; GCN-LABEL: name: function_no_fold_fi_non_stack_rsrc
+    ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
+    ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+    %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %3
+    SI_RETURN_TO_EPILOG $vgpr0
+
+...
+
+---
+name: function_no_fold_fi_non_stack_soffset
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    4
+  localFrameSize:  4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg:  '$sgpr32'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: function_no_fold_fi_non_stack_soffset
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ; GCN: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %2
+    S_ENDPGM 0, implicit $vgpr0
+
+...
+
+---
+name: function_fold_fi_mubuf_wave_relative
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    4
+  localFrameSize:  4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg:  '$sgpr32'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: function_fold_fi_mubuf_wave_relative
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ; GCN: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %2
+    S_ENDPGM 0, implicit $vgpr0
+
+...
+
+---
+name: function_fold_fi_mubuf_stack_relative
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    4
+  localFrameSize:  4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: false
   scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  scratchWaveOffsetReg: '$sgpr33'
+  frameOffsetReg:  '$sgpr32'
   stackPtrOffsetReg: '$sgpr32'
 body:             |
   bb.0:
 
-    ; GCN-LABEL: name: no_fold_fi_mubuf_scratch_sp_offset
+    ; GCN-LABEL: name: function_fold_fi_mubuf_stack_relative
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
     ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
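
The renamed MIR tests spell out the folding rule SIFoldOperands applies
after this change: a frame index may replace a MUBUF vaddr only when the
instruction really addresses the stack, i.e. its rsrc is the function's
scratchRSrcReg and its soffset is the stack pointer (or, in a kernel, an
inline zero). A minimal positive case, assuming the same
machineFunctionInfo as the tests above:

  ; %stack.0 folds into vaddr: s[0:3]/$sgpr32 match the stack registers
  %v:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec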

diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index a928384457a4..2b60004f3a8b 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -7,10 +7,9 @@
 ; Materialize into a mov. Make sure there isn't an unnecessary copy.
 ; GCN-LABEL: {{^}}func_mov_fi_i32:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
 
-; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6
-; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]]
+; CI-NEXT: v_lshr_b32_e64 v0, s32, 6
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s32
 
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
@@ -24,19 +23,15 @@ define void @func_mov_fi_i32() #0 {
 ; GCN-LABEL: {{^}}func_mov_fi_i32_offset:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 
-; CI: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
-; CI-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
-; CI-DAG: v_lshr_b32_e64 v0, [[SUB0]], 6
-; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB1]], 6
+; CI-DAG: v_lshr_b32_e64 v0, s32, 6
+; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
 ; CI-NOT: v_mov
 ; CI: ds_write_b32 v0, v0
-; CI-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, 4, [[SCALED]]
+; CI-NEXT: v_add_i32_e{{32|64}} v0, {{s\[[0-9]+:[0-9]+\]|vcc}}, 4, [[SCALED]]
 ; CI-NEXT: ds_write_b32 v0, v0
 
-; GFX9: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
-; GFX9-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
-; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB0]]
-; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB1]]
+; GFX9: v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
 ; GFX9-DAG: ds_write_b32 v0, v0
 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
 ; GFX9-NEXT: ds_write_b32 v0, v0
@@ -53,15 +48,13 @@ define void @func_mov_fi_i32_offset() #0 {
 
 ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
 
-; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
+; CI: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
 ; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]]
 
-; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]]
+; GFX9: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
 
-
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
 define void @func_add_constant_to_fi_i32() #0 {
@@ -75,11 +68,10 @@ define void @func_add_constant_to_fi_i32() #0 {
 ; into.
 
 ; GCN-LABEL: {{^}}func_other_fi_user_i32:
-; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
 
-; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6
+; CI: v_lshr_b32_e64 v0, s32, 6
 
-; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]]
+; GFX9: v_lshrrev_b32_e64 v0, 6, s32
 
 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0
 ; GCN-NOT: v_mov
@@ -94,7 +86,7 @@ define void @func_other_fi_user_i32() #0 {
 
 ; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr:
 ; GCN: v_mov_b32_e32 v1, 15{{$}}
-; GCN: buffer_store_dword v1, v0, s[0:3], s33 offen{{$}}
+; GCN: buffer_store_dword v1, v0, s[0:3], 0 offen{{$}}
 define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
   store volatile i32 15, i32 addrspace(5)* %ptr
   ret void
@@ -102,7 +94,7 @@ define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
 
 ; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
 ; GCN: s_waitcnt
-; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen{{$}}
+; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}}
 define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
   %val = load volatile i32, i32 addrspace(5)* %ptr
   ret void
@@ -110,12 +102,11 @@ define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
 
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
 ; GCN: s_waitcnt
-; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
 
-; CI-NEXT: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
+; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
 ; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]
 
-; GFX9-NEXT: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]]
+; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32
 ; GFX9-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]
 
 ; GCN-NOT: v_mov
@@ -143,11 +134,10 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* b
 }
 
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
-; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33
 
-; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
+; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
 
-; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]]
+; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32
 
 ; GCN: s_and_saveexec_b64
 
@@ -175,13 +165,12 @@ ret:
 
 ; Added offset can't be used with VOP3 add
 ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32:
-; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
-; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200
 
-; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
+; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200
+; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
 ; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]]
 
-; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]]
+; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
 ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]]
 
 ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]]
@@ -199,13 +188,12 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
 }
 
 ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
-; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s33
-; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200
 
-; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6
+; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200
+; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]
 
-; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[DIFF]]
+; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
 ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]]
 
 ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]]
@@ -256,12 +244,11 @@ bb5:
 ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block:
 ; GCN: s_and_saveexec_b64
 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
-; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
 
-; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
+; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
 ; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]
 
-; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]]
+; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32
 ; GFX9-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]
 
 ; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]]
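
Every hunk in this file drops the same instruction: the s_sub_u32 of s33
from s32 that used to strip the wave offset before scaling. With s32 now
wave-relative, the shift applies to it directly. The shift amount of 6 is
the divide by the 64-lane wave size that turns a per-wave (unswizzled)
byte offset into a per-lane (swizzled) one:

  ; swizzled = unswizzled / 64 (wave64)
  v_lshr_b32_e64    v0, s32, 6   ; CI encoding
  v_lshrrev_b32_e64 v0, 6, s32   ; GFX9 encoding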

diff --git a/llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir b/llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir
index 686a617a064f..d8542bd07567 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir
@@ -26,7 +26,6 @@ machineFunctionInfo:
   isEntryFunction: true
   waveLimiter:     true
   scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
-  scratchWaveOffsetReg: '$sgpr101'
   frameOffsetReg:  '$sgpr101'
   stackPtrOffsetReg: '$sgpr32'
   argumentInfo:    

diff --git a/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir b/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir
index 42484398f9d0..5cc4782c6062 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir
@@ -29,9 +29,8 @@ machineFunctionInfo:
   isEntryFunction: true
   waveLimiter:     true
   scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
-  scratchWaveOffsetReg: '$sgpr101'
-  frameOffsetReg:  '$sgpr101'
   stackPtrOffsetReg: '$sgpr32'
+  frameOffsetReg: '$sgpr34'
   argumentInfo:
     privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
     dispatchPtr:     { reg: '$sgpr4_sgpr5' }
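
In both frame-lowering MIR tests the scratchWaveOffsetReg field simply
disappears from machineFunctionInfo; where a frame pointer is still wanted
it is now an ordinary SGPR with no tie to the wave offset. A pared-down
sketch of the resulting block:

  machineFunctionInfo:
    isEntryFunction: true
    scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
    stackPtrOffsetReg: '$sgpr32'
    frameOffsetReg:  '$sgpr34' # any free SGPR; no scratchWaveOffsetReg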

diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 49a69386e632..9c8499637948 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -457,8 +457,8 @@ define {i8, i32} @struct_i8_i32_func_void() #0 {
 ; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32:
 ; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]]
 ; GCN: buffer_load_dword [[VAL1:v[0-9]+]]
-; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], s33 offen{{$}}
-; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], s33 offen offset:4{{$}}
+; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], 0 offen{{$}}
+; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], 0 offen offset:4{{$}}
 define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret %arg0) #0 {
   %val0 = load volatile i8, i8 addrspace(1)* undef
   %val1 = load volatile i32, i32 addrspace(1)* undef
@@ -474,39 +474,39 @@ define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret %arg0)
 ; AssertZext inserted. Not using it introduces the spills.
 
 ; GCN-LABEL: {{^}}v33i32_func_void:
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:4{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:8{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:12{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:16{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:20{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:24{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:28{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:32{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:36{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:40{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:44{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:48{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:52{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:56{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:60{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:64{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:68{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:72{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:76{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:80{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:84{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:88{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:92{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:96{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:100{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:104{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:108{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:112{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:116{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:120{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:124{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:128{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:4{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:8{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:12{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:16{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:20{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:24{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:28{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:32{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:36{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:40{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:44{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:48{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:52{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:56{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:60{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:64{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:68{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:72{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:76{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:80{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:84{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:88{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:92{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:96{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:100{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:104{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:108{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:112{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:116{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}}
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define <33 x i32> @v33i32_func_void() #0 {
@@ -516,39 +516,39 @@ define <33 x i32> @v33i32_func_void() #0 {
 }
 
 ; GCN-LABEL: {{^}}struct_v32i32_i32_func_void:
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:4{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:8{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:12{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:16{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:20{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:24{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:28{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:32{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:36{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:40{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:44{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:48{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:52{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:56{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:60{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:64{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:68{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:72{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:76{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:80{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:84{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:88{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:92{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:96{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:100{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:104{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:108{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:112{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:116{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:120{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:124{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:128{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:4{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:8{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:12{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:16{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:20{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:24{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:28{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:32{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:36{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:40{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:44{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:48{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:52{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:56{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:60{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:64{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:68{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:72{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:76{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:80{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:84{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:88{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:92{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:96{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:100{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:104{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:108{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:112{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:116{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}}
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
@@ -558,39 +558,39 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 }
 
 ; GCN-LABEL: {{^}}struct_i32_v32i32_func_void:
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:128{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:132{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:136{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:140{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:144{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:148{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:152{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:156{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:160{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:164{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:168{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:172{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:176{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:180{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:184{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:188{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:192{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:196{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:200{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:204{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:208{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:212{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:216{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:220{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:224{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:228{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:232{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:236{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:240{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:244{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:248{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:252{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:132{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:136{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:140{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:144{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:148{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:152{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:156{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:160{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:164{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:168{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:172{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:176{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:180{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:184{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:188{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:192{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:196{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:200{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:204{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:208{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:212{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:216{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:220{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:224{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:228{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:232{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:236{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:240{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:244{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:248{{$}}
+; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:252{{$}}
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {

diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
index 14e8e609e5f3..858bd5412367 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -48,10 +48,10 @@ entry:
 }
 
 ; CHECK:   .name:       num_spilled_sgprs
-; GFX700:   .sgpr_spill_count: 40
-; GFX803:   .sgpr_spill_count: 24
-; GFX900:   .sgpr_spill_count: 24
-; GFX1010:  .sgpr_spill_count: 24
+; GFX700:   .sgpr_spill_count: 38
+; GFX803:   .sgpr_spill_count: 22
+; GFX900:   .sgpr_spill_count: 22
+; GFX1010:  .sgpr_spill_count: 22
 ; CHECK:   .symbol:     num_spilled_sgprs.kd
 define amdgpu_kernel void @num_spilled_sgprs(
     i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],

diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 11dc60a5e2a2..12d8973333f0 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -57,9 +57,9 @@ entry:
 ; CHECK-LABEL: - Name:       num_spilled_sgprs
; CHECK:   SymbolName: 'num_spilled_sgprs@kd'
 ; CHECK:   CodeProps:
-; GFX700:     NumSpilledSGPRs: 40
-; GFX803:     NumSpilledSGPRs: 24
-; GFX900:     NumSpilledSGPRs: 24
+; GFX700:     NumSpilledSGPRs: 38
+; GFX803:     NumSpilledSGPRs: 22
+; GFX900:     NumSpilledSGPRs: 22
 define amdgpu_kernel void @num_spilled_sgprs(
     i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
     i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32],

diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 4ec5d77b64f7..a54b2f59e475 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -9,49 +9,49 @@
 define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
-; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_i32 s2, s0, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s8, s1, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40004
-; GFX7-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s21
-; GFX7-NEXT:    v_mad_i32_i24 v0, s2, v0, v1
-; GFX7-NEXT:    s_bfe_i32 s9, s0, 0x40004
-; GFX7-NEXT:    v_mov_b32_e32 v1, s10
-; GFX7-NEXT:    s_bfe_i32 s12, s1, 0x40008
-; GFX7-NEXT:    v_mad_i32_i24 v0, s9, v1, v0
-; GFX7-NEXT:    s_bfe_i32 s11, s0, 0x40008
-; GFX7-NEXT:    v_mov_b32_e32 v1, s12
-; GFX7-NEXT:    s_bfe_i32 s14, s1, 0x4000c
-; GFX7-NEXT:    v_mad_i32_i24 v0, s11, v1, v0
-; GFX7-NEXT:    s_bfe_i32 s13, s0, 0x4000c
-; GFX7-NEXT:    v_mov_b32_e32 v1, s14
-; GFX7-NEXT:    s_bfe_i32 s16, s1, 0x40010
-; GFX7-NEXT:    v_mad_i32_i24 v0, s13, v1, v0
-; GFX7-NEXT:    s_bfe_i32 s15, s0, 0x40010
-; GFX7-NEXT:    v_mov_b32_e32 v1, s16
-; GFX7-NEXT:    s_bfe_i32 s18, s1, 0x40014
-; GFX7-NEXT:    s_bfe_i32 s20, s1, 0x40018
-; GFX7-NEXT:    v_mad_i32_i24 v0, s15, v1, v0
-; GFX7-NEXT:    s_bfe_i32 s17, s0, 0x40014
-; GFX7-NEXT:    v_mov_b32_e32 v1, s18
-; GFX7-NEXT:    s_bfe_i32 s19, s0, 0x40018
-; GFX7-NEXT:    v_mad_i32_i24 v0, s17, v1, v0
+; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s7, s5, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s9, s5, 0x40004
+; GFX7-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s20
-; GFX7-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX7-NEXT:    v_mad_i32_i24 v0, s19, v1, v0
-; GFX7-NEXT:    s_ashr_i32 s0, s0, 28
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mad_i32_i24 v0, s0, v1, v0
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v0, v1
+; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x40004
+; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x40008
+; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
+; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x40008
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
+; GFX7-NEXT:    s_bfe_i32 s13, s5, 0x4000c
+; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
+; GFX7-NEXT:    s_bfe_i32 s12, s4, 0x4000c
+; GFX7-NEXT:    v_mov_b32_e32 v1, s13
+; GFX7-NEXT:    s_bfe_i32 s15, s5, 0x40010
+; GFX7-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
+; GFX7-NEXT:    s_bfe_i32 s14, s4, 0x40010
+; GFX7-NEXT:    v_mov_b32_e32 v1, s15
+; GFX7-NEXT:    s_bfe_i32 s17, s5, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s19, s5, 0x40018
+; GFX7-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX7-NEXT:    s_bfe_i32 s16, s4, 0x40014
+; GFX7-NEXT:    v_mov_b32_e32 v1, s17
+; GFX7-NEXT:    s_bfe_i32 s18, s4, 0x40018
+; GFX7-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s19
+; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
+; GFX7-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
+; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v1, v0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot8_acc32:
@@ -60,41 +60,41 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x40000
-; GFX8-NEXT:    s_bfe_i32 s6, s4, 0x40000
-; GFX8-NEXT:    s_bfe_i32 s8, s4, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s19
-; GFX8-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
-; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v1, s8
-; GFX8-NEXT:    s_bfe_i32 s10, s4, 0x40008
-; GFX8-NEXT:    v_mad_i32_i24 v0, s7, v1, v0
-; GFX8-NEXT:    s_bfe_i32 s9, s2, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v1, s10
-; GFX8-NEXT:    s_bfe_i32 s12, s4, 0x4000c
-; GFX8-NEXT:    v_mad_i32_i24 v0, s9, v1, v0
-; GFX8-NEXT:    s_bfe_i32 s11, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v1, s12
-; GFX8-NEXT:    s_bfe_i32 s14, s4, 0x40010
-; GFX8-NEXT:    v_mad_i32_i24 v0, s11, v1, v0
-; GFX8-NEXT:    s_bfe_i32 s13, s2, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v1, s14
-; GFX8-NEXT:    s_bfe_i32 s16, s4, 0x40014
-; GFX8-NEXT:    s_bfe_i32 s18, s4, 0x40018
-; GFX8-NEXT:    v_mad_i32_i24 v0, s13, v1, v0
-; GFX8-NEXT:    s_bfe_i32 s15, s2, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v1, s16
-; GFX8-NEXT:    s_bfe_i32 s17, s2, 0x40018
-; GFX8-NEXT:    v_mad_i32_i24 v0, s15, v1, v0
+; GFX8-NEXT:    s_bfe_i32 s4, s2, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s5, s3, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s18
-; GFX8-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX8-NEXT:    v_mad_i32_i24 v0, s17, v1, v0
+; GFX8-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    s_bfe_i32 s9, s3, 0x40008
+; GFX8-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
+; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x40008
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    s_bfe_i32 s11, s3, 0x4000c
+; GFX8-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
+; GFX8-NEXT:    s_bfe_i32 s10, s2, 0x4000c
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    s_bfe_i32 s13, s3, 0x40010
+; GFX8-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
+; GFX8-NEXT:    s_bfe_i32 s12, s2, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v1, s13
+; GFX8-NEXT:    s_bfe_i32 s15, s3, 0x40014
+; GFX8-NEXT:    s_bfe_i32 s17, s3, 0x40018
+; GFX8-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
+; GFX8-NEXT:    s_bfe_i32 s14, s2, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v1, s15
+; GFX8-NEXT:    s_bfe_i32 s16, s2, 0x40018
+; GFX8-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX8-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -107,41 +107,41 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_i32 s5, s2, 0x40000
-; GFX9-NEXT:    s_bfe_i32 s6, s4, 0x40000
-; GFX9-NEXT:    s_bfe_i32 s8, s4, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s19
-; GFX9-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
-; GFX9-NEXT:    s_bfe_i32 s7, s2, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-NEXT:    s_bfe_i32 s10, s4, 0x40008
-; GFX9-NEXT:    v_mad_i32_i24 v0, s7, v1, v0
-; GFX9-NEXT:    s_bfe_i32 s9, s2, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-NEXT:    s_bfe_i32 s12, s4, 0x4000c
-; GFX9-NEXT:    v_mad_i32_i24 v0, s9, v1, v0
-; GFX9-NEXT:    s_bfe_i32 s11, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v1, s12
-; GFX9-NEXT:    s_bfe_i32 s14, s4, 0x40010
-; GFX9-NEXT:    v_mad_i32_i24 v0, s11, v1, v0
-; GFX9-NEXT:    s_bfe_i32 s13, s2, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v1, s14
-; GFX9-NEXT:    s_bfe_i32 s16, s4, 0x40014
-; GFX9-NEXT:    s_bfe_i32 s18, s4, 0x40018
-; GFX9-NEXT:    v_mad_i32_i24 v0, s13, v1, v0
-; GFX9-NEXT:    s_bfe_i32 s15, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v1, s16
-; GFX9-NEXT:    s_bfe_i32 s17, s2, 0x40018
-; GFX9-NEXT:    v_mad_i32_i24 v0, s15, v1, v0
+; GFX9-NEXT:    s_bfe_i32 s4, s2, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s5, s3, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s7, s3, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX9-NEXT:    v_mad_i32_i24 v0, s17, v1, v0
+; GFX9-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_bfe_i32 s9, s3, 0x40008
+; GFX9-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
+; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_bfe_i32 s11, s3, 0x4000c
+; GFX9-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
+; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x4000c
+; GFX9-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9-NEXT:    s_bfe_i32 s13, s3, 0x40010
+; GFX9-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
+; GFX9-NEXT:    s_bfe_i32 s12, s2, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    s_bfe_i32 s15, s3, 0x40014
+; GFX9-NEXT:    s_bfe_i32 s17, s3, 0x40018
+; GFX9-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
+; GFX9-NEXT:    s_bfe_i32 s14, s2, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v1, s15
+; GFX9-NEXT:    s_bfe_i32 s16, s2, 0x40018
+; GFX9-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX9-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -154,11 +154,11 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    v_dot8_i32_i4 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
@@ -167,18 +167,18 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: idot8_acc32:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_dot8_i32_i4 v2, s1, s2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT:    v_dot8_i32_i4 v2, s0, s1, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
@@ -254,66 +254,66 @@ entry:
 define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
-; GFX7-NEXT:    s_mov_b32 s0, 0xffff
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
-; GFX7-NEXT:    s_load_dword s1, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[10:11], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_i32 s8, s1, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s9, s2, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s11, s2, 0x40004
-; GFX7-NEXT:    s_and_b32 s9, s9, s0
-; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40004
-; GFX7-NEXT:    s_bfe_i32 s13, s2, 0x40008
-; GFX7-NEXT:    s_and_b32 s11, s11, s0
-; GFX7-NEXT:    s_and_b32 s8, s8, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s9
-; GFX7-NEXT:    s_bfe_i32 s12, s1, 0x40008
-; GFX7-NEXT:    s_bfe_i32 s15, s2, 0x4000c
-; GFX7-NEXT:    s_and_b32 s13, s13, s0
-; GFX7-NEXT:    s_and_b32 s10, s10, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s11
-; GFX7-NEXT:    s_bfe_i32 s14, s1, 0x4000c
-; GFX7-NEXT:    s_bfe_i32 s17, s2, 0x40010
-; GFX7-NEXT:    s_and_b32 s15, s15, s0
-; GFX7-NEXT:    s_and_b32 s12, s12, s0
-; GFX7-NEXT:    v_mov_b32_e32 v3, s13
-; GFX7-NEXT:    s_bfe_i32 s16, s1, 0x40010
-; GFX7-NEXT:    s_bfe_i32 s19, s2, 0x40014
-; GFX7-NEXT:    s_and_b32 s17, s17, s0
-; GFX7-NEXT:    s_and_b32 s14, s14, s0
-; GFX7-NEXT:    v_mov_b32_e32 v4, s15
-; GFX7-NEXT:    s_bfe_i32 s21, s2, 0x40018
-; GFX7-NEXT:    s_bfe_i32 s18, s1, 0x40014
-; GFX7-NEXT:    s_and_b32 s19, s19, s0
-; GFX7-NEXT:    s_and_b32 s16, s16, s0
-; GFX7-NEXT:    v_mov_b32_e32 v5, s17
-; GFX7-NEXT:    s_bfe_i32 s20, s1, 0x40018
-; GFX7-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX7-NEXT:    s_and_b32 s21, s21, s0
-; GFX7-NEXT:    s_and_b32 s18, s18, s0
-; GFX7-NEXT:    v_mov_b32_e32 v6, s19
-; GFX7-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX7-NEXT:    s_and_b32 s20, s20, s0
-; GFX7-NEXT:    s_and_b32 s2, s2, s0
-; GFX7-NEXT:    v_mov_b32_e32 v7, s21
-; GFX7-NEXT:    s_and_b32 s0, s1, s0
+; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s7, s5, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x40004
+; GFX7-NEXT:    s_and_b32 s7, s7, s8
+; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x40008
+; GFX7-NEXT:    s_and_b32 s10, s10, s8
+; GFX7-NEXT:    s_and_b32 s6, s6, s8
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x40008
+; GFX7-NEXT:    s_bfe_i32 s14, s5, 0x4000c
+; GFX7-NEXT:    s_and_b32 s12, s12, s8
+; GFX7-NEXT:    s_and_b32 s9, s9, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s10
+; GFX7-NEXT:    s_bfe_i32 s13, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_i32 s16, s5, 0x40010
+; GFX7-NEXT:    s_and_b32 s14, s14, s8
+; GFX7-NEXT:    s_and_b32 s11, s11, s8
+; GFX7-NEXT:    v_mov_b32_e32 v3, s12
+; GFX7-NEXT:    s_bfe_i32 s15, s4, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s18, s5, 0x40014
+; GFX7-NEXT:    s_and_b32 s16, s16, s8
+; GFX7-NEXT:    s_and_b32 s13, s13, s8
+; GFX7-NEXT:    v_mov_b32_e32 v4, s14
+; GFX7-NEXT:    s_bfe_i32 s20, s5, 0x40018
+; GFX7-NEXT:    s_bfe_i32 s17, s4, 0x40014
+; GFX7-NEXT:    s_and_b32 s18, s18, s8
+; GFX7-NEXT:    s_and_b32 s15, s15, s8
+; GFX7-NEXT:    v_mov_b32_e32 v5, s16
+; GFX7-NEXT:    s_bfe_i32 s19, s4, 0x40018
+; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
+; GFX7-NEXT:    s_and_b32 s20, s20, s8
+; GFX7-NEXT:    s_and_b32 s17, s17, s8
+; GFX7-NEXT:    v_mov_b32_e32 v6, s18
+; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
+; GFX7-NEXT:    s_and_b32 s19, s19, s8
+; GFX7-NEXT:    s_and_b32 s5, s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v7, s20
+; GFX7-NEXT:    s_and_b32 s4, s4, s8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v2, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v3, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s14, v4, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s16, v5, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s18, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s20, v7, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v4, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s15, v5, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s17, v6, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s19, v7, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot8_acc16:
@@ -327,41 +327,41 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_i32 s5, s0, 0x40000
-; GFX8-NEXT:    s_bfe_i32 s6, s1, 0x40000
-; GFX8-NEXT:    s_bfe_i32 s8, s1, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s10, s1, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s5, s1, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s7, s1, 0x40004
+; GFX8-NEXT:    s_bfe_i32 s9, s1, 0x40008
+; GFX8-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 12
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 12
-; GFX8-NEXT:    s_bfe_i32 s7, s0, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s9, s0, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v3, s10
-; GFX8-NEXT:    v_mov_b32_e32 v7, s8
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 12
+; GFX8-NEXT:    s_bfe_i32 s6, s0, 0x40004
+; GFX8-NEXT:    s_bfe_i32 s8, s0, 0x40008
+; GFX8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX8-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX8-NEXT:    v_lshlrev_b16_e64 v4, 12, s2
-; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
-; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
-; GFX8-NEXT:    s_bfe_i32 s12, s1, 0x40010
+; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s3
+; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s8, v3
+; GFX8-NEXT:    s_bfe_i32 s11, s1, 0x40010
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
-; GFX8-NEXT:    s_bfe_i32 s14, s1, 0x40014
-; GFX8-NEXT:    s_bfe_i32 s11, s0, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v8, s12
-; GFX8-NEXT:    s_bfe_i32 s16, s1, 0x40018
-; GFX8-NEXT:    s_bfe_i32 s13, s0, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v9, s14
-; GFX8-NEXT:    s_bfe_i32 s15, s0, 0x40018
+; GFX8-NEXT:    s_bfe_i32 s13, s1, 0x40014
+; GFX8-NEXT:    s_bfe_i32 s10, s0, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v8, s11
+; GFX8-NEXT:    s_bfe_i32 s15, s1, 0x40018
+; GFX8-NEXT:    s_bfe_i32 s12, s0, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v9, s13
+; GFX8-NEXT:    s_bfe_i32 s14, s0, 0x40018
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX8-NEXT:    v_mov_b32_e32 v10, s16
+; GFX8-NEXT:    v_mov_b32_e32 v10, s15
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v6, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v7, v2
 ; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s10, v8, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s12, v9, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s14, v10, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
@@ -378,41 +378,41 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_i32 s5, s0, 0x40000
-; GFX9-NEXT:    s_bfe_i32 s6, s1, 0x40000
-; GFX9-NEXT:    s_bfe_i32 s8, s1, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s10, s1, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-NEXT:    s_bfe_i32 s4, s0, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s5, s1, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s7, s1, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s9, s1, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 12
-; GFX9-NEXT:    s_lshr_b32 s4, s1, 12
-; GFX9-NEXT:    s_bfe_i32 s7, s0, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s9, s0, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v3, s10
-; GFX9-NEXT:    v_mov_b32_e32 v7, s8
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 12
+; GFX9-NEXT:    s_bfe_i32 s6, s0, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s8, s0, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s2
-; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
-; GFX9-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
-; GFX9-NEXT:    s_bfe_i32 s12, s1, 0x40010
+; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s3
+; GFX9-NEXT:    v_mul_i32_i24_e32 v3, s8, v3
+; GFX9-NEXT:    s_bfe_i32 s11, s1, 0x40010
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-NEXT:    s_bfe_i32 s14, s1, 0x40014
-; GFX9-NEXT:    s_bfe_i32 s11, s0, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    s_bfe_i32 s16, s1, 0x40018
-; GFX9-NEXT:    s_bfe_i32 s13, s0, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v9, s14
-; GFX9-NEXT:    s_bfe_i32 s15, s0, 0x40018
+; GFX9-NEXT:    s_bfe_i32 s13, s1, 0x40014
+; GFX9-NEXT:    s_bfe_i32 s10, s0, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-NEXT:    s_bfe_i32 s15, s1, 0x40018
+; GFX9-NEXT:    s_bfe_i32 s12, s0, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v9, s13
+; GFX9-NEXT:    s_bfe_i32 s14, s0, 0x40018
 ; GFX9-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX9-NEXT:    v_mov_b32_e32 v10, s16
+; GFX9-NEXT:    v_mov_b32_e32 v10, s15
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s4, v6, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s6, v7, v2
 ; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX9-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s10, v8, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s12, v9, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s14, v10, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
@@ -429,41 +429,41 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_i32 s5, s0, 0x40000
-; GFX9-DL-NEXT:    s_bfe_i32 s6, s1, 0x40000
-; GFX9-DL-NEXT:    s_bfe_i32 s8, s1, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s10, s1, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT:    s_bfe_i32 s4, s0, 0x40000
+; GFX9-DL-NEXT:    s_bfe_i32 s5, s1, 0x40000
+; GFX9-DL-NEXT:    s_bfe_i32 s7, s1, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s9, s1, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 12
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s1, 12
-; GFX9-DL-NEXT:    s_bfe_i32 s7, s0, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s8
+; GFX9-DL-NEXT:    s_lshr_b32 s3, s1, 12
+; GFX9-DL-NEXT:    s_bfe_i32 s6, s0, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s8, s0, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s2
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
-; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
-; GFX9-DL-NEXT:    s_bfe_i32 s12, s1, 0x40010
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s3
+; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v3, s8, v3
+; GFX9-DL-NEXT:    s_bfe_i32 s11, s1, 0x40010
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-DL-NEXT:    s_bfe_i32 s14, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_i32 s11, s0, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-DL-NEXT:    s_bfe_i32 s16, s1, 0x40018
-; GFX9-DL-NEXT:    s_bfe_i32 s13, s0, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s14
-; GFX9-DL-NEXT:    s_bfe_i32 s15, s0, 0x40018
+; GFX9-DL-NEXT:    s_bfe_i32 s13, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_i32 s10, s0, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT:    s_bfe_i32 s15, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_i32 s12, s0, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s13
+; GFX9-DL-NEXT:    s_bfe_i32 s14, s0, 0x40018
 ; GFX9-DL-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s16
+; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s15
 ; GFX9-DL-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v6, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v7, v2
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s10, v8, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s12, v9, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s14, v10, v2
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
@@ -471,49 +471,49 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: idot8_acc16:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 12
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s1, 12
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s0, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s1, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s0, 0x40004
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 12
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s0, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s1, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40004
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s2
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s4
-; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40008
-; GFX10-DL-NEXT:    s_bfe_i32 s9, s1, 0x40008
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s3
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s1, 0x40008
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s1, 0x40004
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
-; GFX10-DL-NEXT:    s_mov_b32 s4, 0xffff
+; GFX10-DL-NEXT:    s_mov_b32 s3, 0xffff
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
-; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v5, s8, s9
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX10-DL-NEXT:    s_bfe_i32 s4, s1, 0x40010
+; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v5, s7, s8
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, s3, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, s3, v4
+; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40010
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s0, 0x40014
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s7, s2, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s1, 0x40014
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s6, s2, v2
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, v3, v4, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
-; GFX10-DL-NEXT:    s_bfe_i32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
@@ -589,66 +589,66 @@ entry:
 define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc8:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GFX7-NEXT:    s_load_dword s1, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[10:11], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_i32 s8, s1, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s9, s2, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s11, s2, 0x40004
-; GFX7-NEXT:    s_and_b32 s9, s9, s0
-; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40004
-; GFX7-NEXT:    s_bfe_i32 s13, s2, 0x40008
-; GFX7-NEXT:    s_and_b32 s11, s11, s0
-; GFX7-NEXT:    s_and_b32 s8, s8, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s9
-; GFX7-NEXT:    s_bfe_i32 s12, s1, 0x40008
-; GFX7-NEXT:    s_bfe_i32 s15, s2, 0x4000c
-; GFX7-NEXT:    s_and_b32 s13, s13, s0
-; GFX7-NEXT:    s_and_b32 s10, s10, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s11
-; GFX7-NEXT:    s_bfe_i32 s14, s1, 0x4000c
-; GFX7-NEXT:    s_bfe_i32 s17, s2, 0x40010
-; GFX7-NEXT:    s_and_b32 s15, s15, s0
-; GFX7-NEXT:    s_and_b32 s12, s12, s0
-; GFX7-NEXT:    v_mov_b32_e32 v3, s13
-; GFX7-NEXT:    s_bfe_i32 s16, s1, 0x40010
-; GFX7-NEXT:    s_bfe_i32 s19, s2, 0x40014
-; GFX7-NEXT:    s_and_b32 s17, s17, s0
-; GFX7-NEXT:    s_and_b32 s14, s14, s0
-; GFX7-NEXT:    v_mov_b32_e32 v4, s15
-; GFX7-NEXT:    s_bfe_i32 s21, s2, 0x40018
-; GFX7-NEXT:    s_bfe_i32 s18, s1, 0x40014
-; GFX7-NEXT:    s_and_b32 s19, s19, s0
-; GFX7-NEXT:    s_and_b32 s16, s16, s0
-; GFX7-NEXT:    v_mov_b32_e32 v5, s17
-; GFX7-NEXT:    s_bfe_i32 s20, s1, 0x40018
-; GFX7-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX7-NEXT:    s_and_b32 s21, s21, s0
-; GFX7-NEXT:    s_and_b32 s18, s18, s0
-; GFX7-NEXT:    v_mov_b32_e32 v6, s19
-; GFX7-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX7-NEXT:    s_and_b32 s20, s20, s0
-; GFX7-NEXT:    s_and_b32 s2, s2, s0
-; GFX7-NEXT:    v_mov_b32_e32 v7, s21
-; GFX7-NEXT:    s_and_b32 s0, s1, s0
+; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s7, s5, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x40004
+; GFX7-NEXT:    s_and_b32 s7, s7, s8
+; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x40008
+; GFX7-NEXT:    s_and_b32 s10, s10, s8
+; GFX7-NEXT:    s_and_b32 s6, s6, s8
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x40008
+; GFX7-NEXT:    s_bfe_i32 s14, s5, 0x4000c
+; GFX7-NEXT:    s_and_b32 s12, s12, s8
+; GFX7-NEXT:    s_and_b32 s9, s9, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s10
+; GFX7-NEXT:    s_bfe_i32 s13, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_i32 s16, s5, 0x40010
+; GFX7-NEXT:    s_and_b32 s14, s14, s8
+; GFX7-NEXT:    s_and_b32 s11, s11, s8
+; GFX7-NEXT:    v_mov_b32_e32 v3, s12
+; GFX7-NEXT:    s_bfe_i32 s15, s4, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s18, s5, 0x40014
+; GFX7-NEXT:    s_and_b32 s16, s16, s8
+; GFX7-NEXT:    s_and_b32 s13, s13, s8
+; GFX7-NEXT:    v_mov_b32_e32 v4, s14
+; GFX7-NEXT:    s_bfe_i32 s20, s5, 0x40018
+; GFX7-NEXT:    s_bfe_i32 s17, s4, 0x40014
+; GFX7-NEXT:    s_and_b32 s18, s18, s8
+; GFX7-NEXT:    s_and_b32 s15, s15, s8
+; GFX7-NEXT:    v_mov_b32_e32 v5, s16
+; GFX7-NEXT:    s_bfe_i32 s19, s4, 0x40018
+; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
+; GFX7-NEXT:    s_and_b32 s20, s20, s8
+; GFX7-NEXT:    s_and_b32 s17, s17, s8
+; GFX7-NEXT:    v_mov_b32_e32 v6, s18
+; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
+; GFX7-NEXT:    s_and_b32 s19, s19, s8
+; GFX7-NEXT:    s_and_b32 s5, s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v7, s20
+; GFX7-NEXT:    s_and_b32 s4, s4, s8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v2, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v3, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s14, v4, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s16, v5, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s18, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s20, v7, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v4, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s15, v5, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s17, v6, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s19, v7, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot8_acc8:
@@ -657,50 +657,50 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_i32 s7, s6, 0x40000
-; GFX8-NEXT:    s_lshr_b32 s4, s6, 12
-; GFX8-NEXT:    s_bfe_i32 s9, s6, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s11, s6, 0x40008
+; GFX8-NEXT:    s_bfe_i32 s6, s3, 0x40000
+; GFX8-NEXT:    s_lshr_b32 s4, s3, 12
+; GFX8-NEXT:    s_bfe_i32 s8, s3, 0x40004
+; GFX8-NEXT:    s_bfe_i32 s10, s3, 0x40008
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 12
 ; GFX8-NEXT:    s_bfe_i32 s5, s0, 0x40000
-; GFX8-NEXT:    v_mov_b32_e32 v6, s7
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX8-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
 ; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
-; GFX8-NEXT:    s_bfe_i32 s8, s0, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s10, s0, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v3, s11
-; GFX8-NEXT:    v_mov_b32_e32 v7, s9
+; GFX8-NEXT:    s_bfe_i32 s7, s0, 0x40004
+; GFX8-NEXT:    s_bfe_i32 s9, s0, 0x40008
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mov_b32_e32 v7, s8
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
-; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s10, v3
-; GFX8-NEXT:    s_bfe_i32 s13, s6, 0x40010
+; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
+; GFX8-NEXT:    s_bfe_i32 s12, s3, 0x40010
 ; GFX8-NEXT:    v_and_b32_e32 v4, s2, v4
 ; GFX8-NEXT:    v_and_b32_e32 v5, s2, v5
-; GFX8-NEXT:    s_bfe_i32 s15, s6, 0x40014
-; GFX8-NEXT:    s_bfe_i32 s12, s0, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v8, s13
-; GFX8-NEXT:    s_bfe_i32 s17, s6, 0x40018
-; GFX8-NEXT:    s_bfe_i32 s14, s0, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v9, s15
-; GFX8-NEXT:    s_bfe_i32 s16, s0, 0x40018
-; GFX8-NEXT:    s_ashr_i32 s6, s6, 28
-; GFX8-NEXT:    v_mov_b32_e32 v10, s17
+; GFX8-NEXT:    s_bfe_i32 s14, s3, 0x40014
+; GFX8-NEXT:    s_bfe_i32 s11, s0, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v8, s12
+; GFX8-NEXT:    s_bfe_i32 s16, s3, 0x40018
+; GFX8-NEXT:    s_bfe_i32 s13, s0, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v9, s14
+; GFX8-NEXT:    s_bfe_i32 s15, s0, 0x40018
+; GFX8-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX8-NEXT:    v_mov_b32_e32 v10, s16
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v7, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
 ; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s12, v8, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s14, v9, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s16, v10, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s6
+; GFX8-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -711,50 +711,50 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_i32 s7, s6, 0x40000
-; GFX9-NEXT:    s_lshr_b32 s4, s6, 12
-; GFX9-NEXT:    s_bfe_i32 s9, s6, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s11, s6, 0x40008
+; GFX9-NEXT:    s_bfe_i32 s6, s3, 0x40000
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 12
+; GFX9-NEXT:    s_bfe_i32 s8, s3, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s10, s3, 0x40008
 ; GFX9-NEXT:    s_lshr_b32 s1, s0, 12
 ; GFX9-NEXT:    s_bfe_i32 s5, s0, 0x40000
-; GFX9-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
-; GFX9-NEXT:    s_bfe_i32 s8, s0, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s10, s0, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    v_mov_b32_e32 v7, s9
+; GFX9-NEXT:    s_bfe_i32 s7, s0, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s9, s0, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mov_b32_e32 v7, s8
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-NEXT:    v_mul_i32_i24_e32 v3, s10, v3
-; GFX9-NEXT:    s_bfe_i32 s13, s6, 0x40010
+; GFX9-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
+; GFX9-NEXT:    s_bfe_i32 s12, s3, 0x40010
 ; GFX9-NEXT:    v_and_b32_e32 v4, s2, v4
 ; GFX9-NEXT:    v_and_b32_e32 v5, s2, v5
-; GFX9-NEXT:    s_bfe_i32 s15, s6, 0x40014
-; GFX9-NEXT:    s_bfe_i32 s12, s0, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v8, s13
-; GFX9-NEXT:    s_bfe_i32 s17, s6, 0x40018
-; GFX9-NEXT:    s_bfe_i32 s14, s0, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v9, s15
-; GFX9-NEXT:    s_bfe_i32 s16, s0, 0x40018
-; GFX9-NEXT:    s_ashr_i32 s6, s6, 28
-; GFX9-NEXT:    v_mov_b32_e32 v10, s17
+; GFX9-NEXT:    s_bfe_i32 s14, s3, 0x40014
+; GFX9-NEXT:    s_bfe_i32 s11, s0, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-NEXT:    s_bfe_i32 s16, s3, 0x40018
+; GFX9-NEXT:    s_bfe_i32 s13, s0, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v9, s14
+; GFX9-NEXT:    s_bfe_i32 s15, s0, 0x40018
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX9-NEXT:    v_mov_b32_e32 v10, s16
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s8, v7, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
 ; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s12, v8, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s14, v9, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s16, v10, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -765,99 +765,99 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_i32 s7, s6, 0x40000
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s6, 12
-; GFX9-DL-NEXT:    s_bfe_i32 s9, s6, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s11, s6, 0x40008
+; GFX9-DL-NEXT:    s_bfe_i32 s6, s3, 0x40000
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s3, 12
+; GFX9-DL-NEXT:    s_bfe_i32 s8, s3, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s10, s3, 0x40008
 ; GFX9-DL-NEXT:    s_lshr_b32 s1, s0, 12
 ; GFX9-DL-NEXT:    s_bfe_i32 s5, s0, 0x40000
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
-; GFX9-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s10, s0, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s9
+; GFX9-DL-NEXT:    s_bfe_i32 s7, s0, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s8
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v3, s10, v3
-; GFX9-DL-NEXT:    s_bfe_i32 s13, s6, 0x40010
+; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
+; GFX9-DL-NEXT:    s_bfe_i32 s12, s3, 0x40010
 ; GFX9-DL-NEXT:    v_and_b32_e32 v4, s2, v4
 ; GFX9-DL-NEXT:    v_and_b32_e32 v5, s2, v5
-; GFX9-DL-NEXT:    s_bfe_i32 s15, s6, 0x40014
-; GFX9-DL-NEXT:    s_bfe_i32 s12, s0, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s13
-; GFX9-DL-NEXT:    s_bfe_i32 s17, s6, 0x40018
-; GFX9-DL-NEXT:    s_bfe_i32 s14, s0, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s15
-; GFX9-DL-NEXT:    s_bfe_i32 s16, s0, 0x40018
-; GFX9-DL-NEXT:    s_ashr_i32 s6, s6, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s17
+; GFX9-DL-NEXT:    s_bfe_i32 s14, s3, 0x40014
+; GFX9-DL-NEXT:    s_bfe_i32 s11, s0, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT:    s_bfe_i32 s16, s3, 0x40018
+; GFX9-DL-NEXT:    s_bfe_i32 s13, s0, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s14
+; GFX9-DL-NEXT:    s_bfe_i32 s15, s0, 0x40018
+; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s16
 ; GFX9-DL-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s8, v7, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s12, v8, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s14, v9, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s16, v10, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot8_acc8:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 12
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s1, 12
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s0, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s1, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s0, 0x40004
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 12
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s0, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s1, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40004
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s2
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s4
-; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40008
-; GFX10-DL-NEXT:    s_bfe_i32 s9, s1, 0x40008
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s3
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s1, 0x40008
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s1, 0x40004
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
-; GFX10-DL-NEXT:    s_movk_i32 s4, 0xff
+; GFX10-DL-NEXT:    s_movk_i32 s3, 0xff
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
-; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v5, s8, s9
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX10-DL-NEXT:    s_bfe_i32 s4, s1, 0x40010
+; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v5, s7, s8
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, s3, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, s3, v4
+; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40010
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s0, 0x40014
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s7, s2, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s1, 0x40014
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s6, s2, v2
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, v3, v4, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
-; GFX10-DL-NEXT:    s_bfe_i32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
@@ -934,51 +934,51 @@ entry:
 define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_multiuses_mul1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
-; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_i32 s2, s0, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s8, s1, 0x40000
-; GFX7-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s21
-; GFX7-NEXT:    v_mad_i32_i24 v1, s2, v0, v1
-; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40004
-; GFX7-NEXT:    s_bfe_i32 s9, s0, 0x40004
-; GFX7-NEXT:    s_bfe_i32 s12, s1, 0x40008
-; GFX7-NEXT:    v_mad_i32_i24 v0, s2, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s10
-; GFX7-NEXT:    v_mad_i32_i24 v0, s9, v2, v0
-; GFX7-NEXT:    s_bfe_i32 s11, s0, 0x40008
-; GFX7-NEXT:    v_mov_b32_e32 v2, s12
-; GFX7-NEXT:    s_bfe_i32 s14, s1, 0x4000c
-; GFX7-NEXT:    v_mad_i32_i24 v0, s11, v2, v0
-; GFX7-NEXT:    s_bfe_i32 s13, s0, 0x4000c
-; GFX7-NEXT:    v_mov_b32_e32 v2, s14
-; GFX7-NEXT:    s_bfe_i32 s16, s1, 0x40010
-; GFX7-NEXT:    v_mad_i32_i24 v0, s13, v2, v0
-; GFX7-NEXT:    s_bfe_i32 s15, s0, 0x40010
-; GFX7-NEXT:    v_mov_b32_e32 v2, s16
-; GFX7-NEXT:    s_bfe_i32 s18, s1, 0x40014
-; GFX7-NEXT:    s_bfe_i32 s20, s1, 0x40018
-; GFX7-NEXT:    v_mad_i32_i24 v0, s15, v2, v0
-; GFX7-NEXT:    s_bfe_i32 s17, s0, 0x40014
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    s_bfe_i32 s19, s0, 0x40018
-; GFX7-NEXT:    v_mad_i32_i24 v0, s17, v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s20
-; GFX7-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX7-NEXT:    v_mad_i32_i24 v0, s19, v2, v0
-; GFX7-NEXT:    s_ashr_i32 s0, s0, 28
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
-; GFX7-NEXT:    v_mad_i32_i24 v0, s0, v2, v0
+; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s7, s5, 0x40000
+; GFX7-NEXT:    v_mov_b32_e32 v0, s7
+; GFX7-NEXT:    v_mov_b32_e32 v1, s20
+; GFX7-NEXT:    v_mad_i32_i24 v1, s6, v0, v1
+; GFX7-NEXT:    s_bfe_i32 s9, s5, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x40008
+; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v2, v0
+; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x40008
+; GFX7-NEXT:    v_mov_b32_e32 v2, s11
+; GFX7-NEXT:    s_bfe_i32 s13, s5, 0x4000c
+; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v2, v0
+; GFX7-NEXT:    s_bfe_i32 s12, s4, 0x4000c
+; GFX7-NEXT:    v_mov_b32_e32 v2, s13
+; GFX7-NEXT:    s_bfe_i32 s15, s5, 0x40010
+; GFX7-NEXT:    v_mad_i32_i24 v0, s12, v2, v0
+; GFX7-NEXT:    s_bfe_i32 s14, s4, 0x40010
+; GFX7-NEXT:    v_mov_b32_e32 v2, s15
+; GFX7-NEXT:    s_bfe_i32 s17, s5, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s19, s5, 0x40018
+; GFX7-NEXT:    v_mad_i32_i24 v0, s14, v2, v0
+; GFX7-NEXT:    s_bfe_i32 s16, s4, 0x40014
+; GFX7-NEXT:    v_mov_b32_e32 v2, s17
+; GFX7-NEXT:    s_bfe_i32 s18, s4, 0x40018
+; GFX7-NEXT:    v_mad_i32_i24 v0, s16, v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s19
+; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
+; GFX7-NEXT:    v_mad_i32_i24 v0, s18, v2, v0
+; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v2, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot8_multiuses_mul1:
@@ -987,42 +987,42 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x40000
-; GFX8-NEXT:    s_bfe_i32 s6, s4, 0x40000
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s19
-; GFX8-NEXT:    v_mad_i32_i24 v1, s5, v0, v1
-; GFX8-NEXT:    s_bfe_i32 s8, s4, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s10, s4, 0x40008
-; GFX8-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s8
-; GFX8-NEXT:    v_mad_i32_i24 v0, s7, v2, v0
-; GFX8-NEXT:    s_bfe_i32 s9, s2, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v2, s10
-; GFX8-NEXT:    s_bfe_i32 s12, s4, 0x4000c
-; GFX8-NEXT:    v_mad_i32_i24 v0, s9, v2, v0
-; GFX8-NEXT:    s_bfe_i32 s11, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v2, s12
-; GFX8-NEXT:    s_bfe_i32 s14, s4, 0x40010
-; GFX8-NEXT:    v_mad_i32_i24 v0, s11, v2, v0
-; GFX8-NEXT:    s_bfe_i32 s13, s2, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v2, s14
-; GFX8-NEXT:    s_bfe_i32 s16, s4, 0x40014
-; GFX8-NEXT:    s_bfe_i32 s18, s4, 0x40018
-; GFX8-NEXT:    v_mad_i32_i24 v0, s13, v2, v0
-; GFX8-NEXT:    s_bfe_i32 s15, s2, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v2, s16
-; GFX8-NEXT:    s_bfe_i32 s17, s2, 0x40018
-; GFX8-NEXT:    v_mad_i32_i24 v0, s15, v2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s18
-; GFX8-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX8-NEXT:    v_mad_i32_i24 v0, s17, v2, v0
+; GFX8-NEXT:    s_bfe_i32 s4, s2, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s5, s3, 0x40000
+; GFX8-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NEXT:    v_mov_b32_e32 v1, s18
+; GFX8-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
+; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x40004
+; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x40004
+; GFX8-NEXT:    s_bfe_i32 s9, s3, 0x40008
+; GFX8-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_mad_i32_i24 v0, s6, v2, v0
+; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x40008
+; GFX8-NEXT:    v_mov_b32_e32 v2, s9
+; GFX8-NEXT:    s_bfe_i32 s11, s3, 0x4000c
+; GFX8-NEXT:    v_mad_i32_i24 v0, s8, v2, v0
+; GFX8-NEXT:    s_bfe_i32 s10, s2, 0x4000c
+; GFX8-NEXT:    v_mov_b32_e32 v2, s11
+; GFX8-NEXT:    s_bfe_i32 s13, s3, 0x40010
+; GFX8-NEXT:    v_mad_i32_i24 v0, s10, v2, v0
+; GFX8-NEXT:    s_bfe_i32 s12, s2, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v2, s13
+; GFX8-NEXT:    s_bfe_i32 s15, s3, 0x40014
+; GFX8-NEXT:    s_bfe_i32 s17, s3, 0x40018
+; GFX8-NEXT:    v_mad_i32_i24 v0, s12, v2, v0
+; GFX8-NEXT:    s_bfe_i32 s14, s2, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v2, s15
+; GFX8-NEXT:    s_bfe_i32 s16, s2, 0x40018
+; GFX8-NEXT:    v_mad_i32_i24 v0, s14, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX8-NEXT:    v_mad_i32_i24 v0, s16, v2, v0
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX8-NEXT:    v_mad_i32_i24 v0, s2, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -1036,42 +1036,42 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_i32 s5, s2, 0x40000
-; GFX9-NEXT:    s_bfe_i32 s6, s4, 0x40000
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s19
-; GFX9-NEXT:    v_mad_i32_i24 v1, s5, v0, v1
-; GFX9-NEXT:    s_bfe_i32 s8, s4, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s7, s2, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s10, s4, 0x40008
-; GFX9-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    v_mad_i32_i24 v0, s7, v2, v0
-; GFX9-NEXT:    s_bfe_i32 s9, s2, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v2, s10
-; GFX9-NEXT:    s_bfe_i32 s12, s4, 0x4000c
-; GFX9-NEXT:    v_mad_i32_i24 v0, s9, v2, v0
-; GFX9-NEXT:    s_bfe_i32 s11, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v2, s12
-; GFX9-NEXT:    s_bfe_i32 s14, s4, 0x40010
-; GFX9-NEXT:    v_mad_i32_i24 v0, s11, v2, v0
-; GFX9-NEXT:    s_bfe_i32 s13, s2, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v2, s14
-; GFX9-NEXT:    s_bfe_i32 s16, s4, 0x40014
-; GFX9-NEXT:    s_bfe_i32 s18, s4, 0x40018
-; GFX9-NEXT:    v_mad_i32_i24 v0, s13, v2, v0
-; GFX9-NEXT:    s_bfe_i32 s15, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v2, s16
-; GFX9-NEXT:    s_bfe_i32 s17, s2, 0x40018
-; GFX9-NEXT:    v_mad_i32_i24 v0, s15, v2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s18
-; GFX9-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX9-NEXT:    v_mad_i32_i24 v0, s17, v2, v0
+; GFX9-NEXT:    s_bfe_i32 s4, s2, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s5, s3, 0x40000
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s18
+; GFX9-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
+; GFX9-NEXT:    s_bfe_i32 s7, s3, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s9, s3, 0x40008
+; GFX9-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_mad_i32_i24 v0, s6, v2, v0
+; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    s_bfe_i32 s11, s3, 0x4000c
+; GFX9-NEXT:    v_mad_i32_i24 v0, s8, v2, v0
+; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x4000c
+; GFX9-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-NEXT:    s_bfe_i32 s13, s3, 0x40010
+; GFX9-NEXT:    v_mad_i32_i24 v0, s10, v2, v0
+; GFX9-NEXT:    s_bfe_i32 s12, s2, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v2, s13
+; GFX9-NEXT:    s_bfe_i32 s15, s3, 0x40014
+; GFX9-NEXT:    s_bfe_i32 s17, s3, 0x40018
+; GFX9-NEXT:    v_mad_i32_i24 v0, s12, v2, v0
+; GFX9-NEXT:    s_bfe_i32 s14, s2, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v2, s15
+; GFX9-NEXT:    s_bfe_i32 s16, s2, 0x40018
+; GFX9-NEXT:    v_mad_i32_i24 v0, s14, v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX9-NEXT:    v_mad_i32_i24 v0, s16, v2, v0
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-NEXT:    v_mad_i32_i24 v0, s2, v2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1085,42 +1085,42 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s19, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_i32 s5, s2, 0x40000
-; GFX9-DL-NEXT:    s_bfe_i32 s6, s4, 0x40000
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s19
-; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s5, v0, v1
-; GFX9-DL-NEXT:    s_bfe_i32 s8, s4, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s7, s2, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s10, s4, 0x40008
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s7, v2, v0
-; GFX9-DL-NEXT:    s_bfe_i32 s9, s2, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s10
-; GFX9-DL-NEXT:    s_bfe_i32 s12, s4, 0x4000c
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s9, v2, v0
-; GFX9-DL-NEXT:    s_bfe_i32 s11, s2, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s12
-; GFX9-DL-NEXT:    s_bfe_i32 s14, s4, 0x40010
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s11, v2, v0
-; GFX9-DL-NEXT:    s_bfe_i32 s13, s2, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s14
-; GFX9-DL-NEXT:    s_bfe_i32 s16, s4, 0x40014
-; GFX9-DL-NEXT:    s_bfe_i32 s18, s4, 0x40018
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s13, v2, v0
-; GFX9-DL-NEXT:    s_bfe_i32 s15, s2, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s16
-; GFX9-DL-NEXT:    s_bfe_i32 s17, s2, 0x40018
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s15, v2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s18
-; GFX9-DL-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s17, v2, v0
+; GFX9-DL-NEXT:    s_bfe_i32 s4, s2, 0x40000
+; GFX9-DL-NEXT:    s_bfe_i32 s5, s3, 0x40000
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s18
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
+; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s6, s2, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s9, s3, 0x40008
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s6, v2, v0
+; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-DL-NEXT:    s_bfe_i32 s11, s3, 0x4000c
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s8, v2, v0
+; GFX9-DL-NEXT:    s_bfe_i32 s10, s2, 0x4000c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT:    s_bfe_i32 s13, s3, 0x40010
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s10, v2, v0
+; GFX9-DL-NEXT:    s_bfe_i32 s12, s2, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s13
+; GFX9-DL-NEXT:    s_bfe_i32 s15, s3, 0x40014
+; GFX9-DL-NEXT:    s_bfe_i32 s17, s3, 0x40018
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s12, v2, v0
+; GFX9-DL-NEXT:    s_bfe_i32 s14, s2, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s15
+; GFX9-DL-NEXT:    s_bfe_i32 s16, s2, 0x40018
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s14, v2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s16, v2, v0
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s2, v2, v0
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v1, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1135,35 +1135,35 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s2, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s4, 0x40000
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s2, 0x40004
-; GFX10-DL-NEXT:    s_bfe_i32 s8, s4, 0x40004
-; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s7, v0
-; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v0
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s2, 0x40008
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s4, 0x40008
-; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s5, s8, v1
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s2, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_i32 s8, s4, 0x4000c
-; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s2, 0x40010
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s4, 0x40010
-; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s5, s8, v1
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s2, 0x40014
-; GFX10-DL-NEXT:    s_bfe_i32 s8, s4, 0x40014
-; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s2, 0x40018
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s4, 0x40018
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s2, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s3, 0x40000
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s2, 0x40004
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s3, 0x40004
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s5, s6, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s5, s6, v0
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s3, 0x40008
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s4, s7, v1
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s2, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s3, 0x4000c
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s5, s6, v1
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s3, 0x40010
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s4, s7, v1
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s3, 0x40014
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s5, s6, v1
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s2, 0x40018
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s3, 0x40018
 ; GFX10-DL-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX10-DL-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s5, s8, v1
-; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
-; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s4, v1
+; GFX10-DL-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s4, s7, v1
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s5, s6, v1
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v0, v1
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1243,64 +1243,64 @@ entry:
 define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc32_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s1, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s9, s[10:11], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_ashr_i64 s[10:11], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s11, s1, 4
-; GFX7-NEXT:    s_ashr_i64 s[16:17], s[10:11], 60
-; GFX7-NEXT:    s_lshl_b32 s11, s1, 16
-; GFX7-NEXT:    s_ashr_i64 s[18:19], s[10:11], 60
-; GFX7-NEXT:    s_lshl_b32 s11, s1, 20
-; GFX7-NEXT:    s_lshl_b32 s13, s1, 8
-; GFX7-NEXT:    s_lshl_b32 s15, s1, 12
-; GFX7-NEXT:    s_ashr_i64 s[20:21], s[10:11], 60
-; GFX7-NEXT:    s_lshl_b32 s11, s1, 24
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 28
-; GFX7-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s9, 4
-; GFX7-NEXT:    s_ashr_i64 s[26:27], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s9, 8
-; GFX7-NEXT:    s_ashr_i64 s[28:29], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s9, 12
-; GFX7-NEXT:    s_ashr_i64 s[30:31], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s9, 16
-; GFX7-NEXT:    s_ashr_i64 s[32:33], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s9, 20
-; GFX7-NEXT:    s_ashr_i64 s[34:35], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s9, 24
-; GFX7-NEXT:    s_ashr_i64 s[36:37], s[0:1], 60
-; GFX7-NEXT:    s_lshl_b32 s1, s9, 28
-; GFX7-NEXT:    s_ashr_i64 s[24:25], s[8:9], 60
-; GFX7-NEXT:    s_ashr_i64 s[8:9], s[0:1], 60
-; GFX7-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7-NEXT:    s_ashr_i64 s[22:23], s[10:11], 60
-; GFX7-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX7-NEXT:    s_ashr_i64 s[8:9], s[4:5], 60
+; GFX7-NEXT:    s_lshl_b32 s9, s5, 4
+; GFX7-NEXT:    s_ashr_i64 s[14:15], s[8:9], 60
+; GFX7-NEXT:    s_lshl_b32 s9, s5, 16
+; GFX7-NEXT:    s_ashr_i64 s[16:17], s[8:9], 60
+; GFX7-NEXT:    s_lshl_b32 s9, s5, 20
+; GFX7-NEXT:    s_lshl_b32 s11, s5, 8
+; GFX7-NEXT:    s_lshl_b32 s13, s5, 12
+; GFX7-NEXT:    s_ashr_i64 s[18:19], s[8:9], 60
+; GFX7-NEXT:    s_lshl_b32 s9, s5, 24
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 28
+; GFX7-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
+; GFX7-NEXT:    s_lshl_b32 s5, s7, 4
+; GFX7-NEXT:    s_ashr_i64 s[24:25], s[4:5], 60
+; GFX7-NEXT:    s_lshl_b32 s5, s7, 8
+; GFX7-NEXT:    s_ashr_i64 s[26:27], s[4:5], 60
+; GFX7-NEXT:    s_lshl_b32 s5, s7, 12
+; GFX7-NEXT:    s_ashr_i64 s[28:29], s[4:5], 60
+; GFX7-NEXT:    s_lshl_b32 s5, s7, 16
+; GFX7-NEXT:    s_ashr_i64 s[30:31], s[4:5], 60
+; GFX7-NEXT:    s_lshl_b32 s5, s7, 20
+; GFX7-NEXT:    s_ashr_i64 s[34:35], s[4:5], 60
+; GFX7-NEXT:    s_lshl_b32 s5, s7, 24
+; GFX7-NEXT:    s_ashr_i64 s[36:37], s[4:5], 60
+; GFX7-NEXT:    s_lshl_b32 s5, s7, 28
+; GFX7-NEXT:    s_ashr_i64 s[22:23], s[6:7], 60
+; GFX7-NEXT:    s_ashr_i64 s[6:7], s[4:5], 60
+; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    s_ashr_i64 s[20:21], s[8:9], 60
 ; GFX7-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX7-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mad_i32_i24 v0, s0, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s36
-; GFX7-NEXT:    v_mad_i32_i24 v0, s22, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s34
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s20, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s32
+; GFX7-NEXT:    v_mov_b32_e32 v1, s34
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s30
-; GFX7-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s28
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s26
-; GFX7-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s24
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s24
+; GFX7-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s22
+; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot8_acc32_vecMul:
@@ -1308,57 +1308,58 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s5, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s7, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_ashr_i64 s[8:9], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s9, s5, 4
-; GFX8-NEXT:    s_ashr_i64 s[16:17], s[8:9], 60
-; GFX8-NEXT:    s_lshl_b32 s9, s5, 20
-; GFX8-NEXT:    s_lshl_b32 s11, s5, 8
-; GFX8-NEXT:    s_lshl_b32 s13, s5, 12
-; GFX8-NEXT:    s_lshl_b32 s15, s5, 16
-; GFX8-NEXT:    s_ashr_i64 s[18:19], s[8:9], 60
-; GFX8-NEXT:    s_lshl_b32 s9, s5, 24
-; GFX8-NEXT:    s_lshl_b32 s5, s5, 28
-; GFX8-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s5, s7, 4
-; GFX8-NEXT:    s_ashr_i64 s[24:25], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s5, s7, 8
-; GFX8-NEXT:    s_ashr_i64 s[26:27], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s5, s7, 12
-; GFX8-NEXT:    s_ashr_i64 s[28:29], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s5, s7, 16
-; GFX8-NEXT:    s_ashr_i64 s[30:31], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s5, s7, 20
-; GFX8-NEXT:    s_ashr_i64 s[32:33], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s5, s7, 24
-; GFX8-NEXT:    s_ashr_i64 s[34:35], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s5, s7, 28
-; GFX8-NEXT:    s_ashr_i64 s[22:23], s[6:7], 60
-; GFX8-NEXT:    s_ashr_i64 s[6:7], s[4:5], 60
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX8-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
-; GFX8-NEXT:    s_ashr_i64 s[20:21], s[8:9], 60
+; GFX8-NEXT:    s_ashr_i64 s[6:7], s[2:3], 60
+; GFX8-NEXT:    s_lshl_b32 s7, s3, 4
+; GFX8-NEXT:    s_ashr_i64 s[14:15], s[6:7], 60
+; GFX8-NEXT:    s_lshl_b32 s7, s3, 20
+; GFX8-NEXT:    s_lshl_b32 s9, s3, 8
+; GFX8-NEXT:    s_lshl_b32 s11, s3, 12
+; GFX8-NEXT:    s_lshl_b32 s13, s3, 16
+; GFX8-NEXT:    s_ashr_i64 s[16:17], s[6:7], 60
+; GFX8-NEXT:    s_lshl_b32 s7, s3, 24
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 28
+; GFX8-NEXT:    s_ashr_i64 s[2:3], s[2:3], 60
+; GFX8-NEXT:    s_lshl_b32 s3, s5, 4
+; GFX8-NEXT:    s_ashr_i64 s[22:23], s[2:3], 60
+; GFX8-NEXT:    s_lshl_b32 s3, s5, 8
+; GFX8-NEXT:    s_ashr_i64 s[24:25], s[2:3], 60
+; GFX8-NEXT:    s_lshl_b32 s3, s5, 12
+; GFX8-NEXT:    s_ashr_i64 s[26:27], s[2:3], 60
+; GFX8-NEXT:    s_lshl_b32 s3, s5, 16
+; GFX8-NEXT:    s_ashr_i64 s[28:29], s[2:3], 60
+; GFX8-NEXT:    s_lshl_b32 s3, s5, 20
+; GFX8-NEXT:    s_ashr_i64 s[30:31], s[2:3], 60
+; GFX8-NEXT:    s_lshl_b32 s3, s5, 24
+; GFX8-NEXT:    s_ashr_i64 s[34:35], s[2:3], 60
+; GFX8-NEXT:    s_lshl_b32 s3, s5, 28
+; GFX8-NEXT:    s_ashr_i64 s[20:21], s[4:5], 60
+; GFX8-NEXT:    s_ashr_i64 s[4:5], s[2:3], 60
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    s_ashr_i64 s[18:19], s[6:7], 60
+; GFX8-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX8-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mad_i32_i24 v0, s2, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s34
-; GFX8-NEXT:    v_mad_i32_i24 v0, s20, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s32
 ; GFX8-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
-; GFX8-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s30
-; GFX8-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
-; GFX8-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX8-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s28
 ; GFX8-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
-; GFX8-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s26
 ; GFX8-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
+; GFX8-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s24
-; GFX8-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
+; GFX8-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s22
-; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v1, v0
+; GFX8-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s20
+; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -1369,57 +1370,58 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s5, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s7, s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i64 s[8:9], s[4:5], 60
-; GFX9-NEXT:    s_lshl_b32 s9, s5, 4
-; GFX9-NEXT:    s_ashr_i64 s[16:17], s[8:9], 60
-; GFX9-NEXT:    s_lshl_b32 s9, s5, 20
-; GFX9-NEXT:    s_lshl_b32 s11, s5, 8
-; GFX9-NEXT:    s_lshl_b32 s13, s5, 12
-; GFX9-NEXT:    s_lshl_b32 s15, s5, 16
-; GFX9-NEXT:    s_ashr_i64 s[18:19], s[8:9], 60
-; GFX9-NEXT:    s_lshl_b32 s9, s5, 24
-; GFX9-NEXT:    s_lshl_b32 s5, s5, 28
-; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
-; GFX9-NEXT:    s_lshl_b32 s5, s7, 4
-; GFX9-NEXT:    s_ashr_i64 s[24:25], s[4:5], 60
-; GFX9-NEXT:    s_lshl_b32 s5, s7, 8
-; GFX9-NEXT:    s_ashr_i64 s[26:27], s[4:5], 60
-; GFX9-NEXT:    s_lshl_b32 s5, s7, 12
-; GFX9-NEXT:    s_ashr_i64 s[28:29], s[4:5], 60
-; GFX9-NEXT:    s_lshl_b32 s5, s7, 16
-; GFX9-NEXT:    s_ashr_i64 s[30:31], s[4:5], 60
-; GFX9-NEXT:    s_lshl_b32 s5, s7, 20
-; GFX9-NEXT:    s_ashr_i64 s[32:33], s[4:5], 60
-; GFX9-NEXT:    s_lshl_b32 s5, s7, 24
-; GFX9-NEXT:    s_ashr_i64 s[34:35], s[4:5], 60
-; GFX9-NEXT:    s_lshl_b32 s5, s7, 28
-; GFX9-NEXT:    s_ashr_i64 s[22:23], s[6:7], 60
-; GFX9-NEXT:    s_ashr_i64 s[6:7], s[4:5], 60
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
-; GFX9-NEXT:    s_ashr_i64 s[20:21], s[8:9], 60
+; GFX9-NEXT:    s_ashr_i64 s[6:7], s[2:3], 60
+; GFX9-NEXT:    s_lshl_b32 s7, s3, 4
+; GFX9-NEXT:    s_ashr_i64 s[14:15], s[6:7], 60
+; GFX9-NEXT:    s_lshl_b32 s7, s3, 20
+; GFX9-NEXT:    s_lshl_b32 s9, s3, 8
+; GFX9-NEXT:    s_lshl_b32 s11, s3, 12
+; GFX9-NEXT:    s_lshl_b32 s13, s3, 16
+; GFX9-NEXT:    s_ashr_i64 s[16:17], s[6:7], 60
+; GFX9-NEXT:    s_lshl_b32 s7, s3, 24
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 28
+; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 60
+; GFX9-NEXT:    s_lshl_b32 s3, s5, 4
+; GFX9-NEXT:    s_ashr_i64 s[22:23], s[2:3], 60
+; GFX9-NEXT:    s_lshl_b32 s3, s5, 8
+; GFX9-NEXT:    s_ashr_i64 s[24:25], s[2:3], 60
+; GFX9-NEXT:    s_lshl_b32 s3, s5, 12
+; GFX9-NEXT:    s_ashr_i64 s[26:27], s[2:3], 60
+; GFX9-NEXT:    s_lshl_b32 s3, s5, 16
+; GFX9-NEXT:    s_ashr_i64 s[28:29], s[2:3], 60
+; GFX9-NEXT:    s_lshl_b32 s3, s5, 20
+; GFX9-NEXT:    s_ashr_i64 s[30:31], s[2:3], 60
+; GFX9-NEXT:    s_lshl_b32 s3, s5, 24
+; GFX9-NEXT:    s_ashr_i64 s[34:35], s[2:3], 60
+; GFX9-NEXT:    s_lshl_b32 s3, s5, 28
+; GFX9-NEXT:    s_ashr_i64 s[20:21], s[4:5], 60
+; GFX9-NEXT:    s_ashr_i64 s[4:5], s[2:3], 60
+; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_ashr_i64 s[18:19], s[6:7], 60
+; GFX9-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX9-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mad_i32_i24 v0, s2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s34
-; GFX9-NEXT:    v_mad_i32_i24 v0, s20, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s32
 ; GFX9-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
-; GFX9-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s30
-; GFX9-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
-; GFX9-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX9-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s28
 ; GFX9-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
-; GFX9-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s26
 ; GFX9-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
+; GFX9-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s24
-; GFX9-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
+; GFX9-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s22
-; GFX9-NEXT:    v_mad_i32_i24 v2, s8, v1, v0
+; GFX9-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s20
+; GFX9-NEXT:    v_mad_i32_i24 v2, s6, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -1430,57 +1432,58 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s5, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s7, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_ashr_i64 s[8:9], s[4:5], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s9, s5, 4
-; GFX9-DL-NEXT:    s_ashr_i64 s[16:17], s[8:9], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s9, s5, 20
-; GFX9-DL-NEXT:    s_lshl_b32 s11, s5, 8
-; GFX9-DL-NEXT:    s_lshl_b32 s13, s5, 12
-; GFX9-DL-NEXT:    s_lshl_b32 s15, s5, 16
-; GFX9-DL-NEXT:    s_ashr_i64 s[18:19], s[8:9], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s9, s5, 24
-; GFX9-DL-NEXT:    s_lshl_b32 s5, s5, 28
-; GFX9-DL-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 4
-; GFX9-DL-NEXT:    s_ashr_i64 s[24:25], s[4:5], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 8
-; GFX9-DL-NEXT:    s_ashr_i64 s[26:27], s[4:5], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 12
-; GFX9-DL-NEXT:    s_ashr_i64 s[28:29], s[4:5], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 16
-; GFX9-DL-NEXT:    s_ashr_i64 s[30:31], s[4:5], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 20
-; GFX9-DL-NEXT:    s_ashr_i64 s[32:33], s[4:5], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 24
-; GFX9-DL-NEXT:    s_ashr_i64 s[34:35], s[4:5], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 28
-; GFX9-DL-NEXT:    s_ashr_i64 s[22:23], s[6:7], 60
-; GFX9-DL-NEXT:    s_ashr_i64 s[6:7], s[4:5], 60
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
-; GFX9-DL-NEXT:    s_ashr_i64 s[20:21], s[8:9], 60
+; GFX9-DL-NEXT:    s_ashr_i64 s[6:7], s[2:3], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s7, s3, 4
+; GFX9-DL-NEXT:    s_ashr_i64 s[14:15], s[6:7], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s7, s3, 20
+; GFX9-DL-NEXT:    s_lshl_b32 s9, s3, 8
+; GFX9-DL-NEXT:    s_lshl_b32 s11, s3, 12
+; GFX9-DL-NEXT:    s_lshl_b32 s13, s3, 16
+; GFX9-DL-NEXT:    s_ashr_i64 s[16:17], s[6:7], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s7, s3, 24
+; GFX9-DL-NEXT:    s_lshl_b32 s3, s3, 28
+; GFX9-DL-NEXT:    s_ashr_i64 s[2:3], s[2:3], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s3, s5, 4
+; GFX9-DL-NEXT:    s_ashr_i64 s[22:23], s[2:3], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s3, s5, 8
+; GFX9-DL-NEXT:    s_ashr_i64 s[24:25], s[2:3], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s3, s5, 12
+; GFX9-DL-NEXT:    s_ashr_i64 s[26:27], s[2:3], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s3, s5, 16
+; GFX9-DL-NEXT:    s_ashr_i64 s[28:29], s[2:3], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s3, s5, 20
+; GFX9-DL-NEXT:    s_ashr_i64 s[30:31], s[2:3], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s3, s5, 24
+; GFX9-DL-NEXT:    s_ashr_i64 s[34:35], s[2:3], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s3, s5, 28
+; GFX9-DL-NEXT:    s_ashr_i64 s[20:21], s[4:5], 60
+; GFX9-DL-NEXT:    s_ashr_i64 s[4:5], s[2:3], 60
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    s_ashr_i64 s[18:19], s[6:7], 60
+; GFX9-DL-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX9-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s2, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s34
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s20, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s32
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
-; GFX9-DL-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s30
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
-; GFX9-DL-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s28
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
-; GFX9-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s26
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
+; GFX9-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s24
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s22
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s8, v1, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s20
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v1, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
@@ -1492,49 +1495,49 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s5, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s7, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_lshl_b32 s7, s3, 28
 ; GFX10-DL-NEXT:    s_lshl_b32 s9, s5, 28
-; GFX10-DL-NEXT:    s_lshl_b32 s11, s7, 28
+; GFX10-DL-NEXT:    s_lshl_b32 s11, s3, 24
 ; GFX10-DL-NEXT:    s_lshl_b32 s13, s5, 24
-; GFX10-DL-NEXT:    s_lshl_b32 s15, s7, 24
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    s_ashr_i64 s[6:7], s[6:7], 60
 ; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
 ; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
 ; GFX10-DL-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
-; GFX10-DL-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX10-DL-NEXT:    s_lshl_b32 s7, s3, 20
 ; GFX10-DL-NEXT:    s_lshl_b32 s9, s5, 20
-; GFX10-DL-NEXT:    s_lshl_b32 s11, s7, 20
-; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s8, s10, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s8, v0
+; GFX10-DL-NEXT:    s_lshl_b32 s11, s3, 16
 ; GFX10-DL-NEXT:    s_lshl_b32 s13, s5, 16
-; GFX10-DL-NEXT:    s_lshl_b32 s15, s7, 16
+; GFX10-DL-NEXT:    s_ashr_i64 s[6:7], s[6:7], 60
 ; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s10, s12, v0
 ; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
-; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s12, s14, v0
 ; GFX10-DL-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
-; GFX10-DL-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX10-DL-NEXT:    s_lshl_b32 s7, s3, 12
 ; GFX10-DL-NEXT:    s_lshl_b32 s9, s5, 12
-; GFX10-DL-NEXT:    s_lshl_b32 s11, s7, 12
-; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s8, s10, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s8, v0
+; GFX10-DL-NEXT:    s_lshl_b32 s11, s3, 8
 ; GFX10-DL-NEXT:    s_lshl_b32 s13, s5, 8
-; GFX10-DL-NEXT:    s_lshl_b32 s15, s7, 8
+; GFX10-DL-NEXT:    s_ashr_i64 s[6:7], s[6:7], 60
 ; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
-; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s12, s14, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s10, s12, v0
+; GFX10-DL-NEXT:    s_lshl_b32 s7, s3, 4
 ; GFX10-DL-NEXT:    s_lshl_b32 s9, s5, 4
-; GFX10-DL-NEXT:    s_lshl_b32 s11, s7, 4
+; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
 ; GFX10-DL-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
-; GFX10-DL-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
-; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s8, s10, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s8, v0
+; GFX10-DL-NEXT:    s_ashr_i64 s[6:7], s[6:7], 60
 ; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX10-DL-NEXT:    s_ashr_i64 s[2:3], s[2:3], 60
 ; GFX10-DL-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
-; GFX10-DL-NEXT:    s_ashr_i64 s[6:7], s[6:7], 60
-; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s12, s14, v0
-; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s8, s10, v0
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s6, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s10, s12, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s8, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s4, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
@@ -1576,62 +1579,62 @@ entry:
 define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc16_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
-; GFX7-NEXT:    s_mov_b32 s2, 0xffff
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[10:11], 0x0
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
-; GFX7-NEXT:    s_load_dword s1, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_i32 s16, s0, 0x40018
-; GFX7-NEXT:    s_bfe_i32 s17, s0, 0x40014
-; GFX7-NEXT:    s_bfe_i32 s18, s0, 0x40010
-; GFX7-NEXT:    s_bfe_i32 s19, s0, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s20, s0, 0x40004
-; GFX7-NEXT:    s_bfe_i32 s21, s0, 0x40008
-; GFX7-NEXT:    s_ashr_i32 s15, s0, 28
-; GFX7-NEXT:    s_bfe_i32 s0, s0, 0x4000c
-; GFX7-NEXT:    s_ashr_i32 s8, s1, 28
-; GFX7-NEXT:    s_bfe_i32 s9, s1, 0x40018
-; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40014
-; GFX7-NEXT:    s_bfe_i32 s11, s1, 0x40010
-; GFX7-NEXT:    s_bfe_i32 s12, s1, 0x40000
-; GFX7-NEXT:    v_mov_b32_e32 v4, s19
-; GFX7-NEXT:    s_bfe_i32 s13, s1, 0x40004
-; GFX7-NEXT:    v_mov_b32_e32 v3, s20
-; GFX7-NEXT:    s_bfe_i32 s14, s1, 0x40008
-; GFX7-NEXT:    v_mov_b32_e32 v2, s21
-; GFX7-NEXT:    s_bfe_i32 s1, s1, 0x4000c
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    v_mul_i32_i24_e32 v1, s1, v1
-; GFX7-NEXT:    v_mul_i32_i24_e32 v2, s14, v2
-; GFX7-NEXT:    v_mul_i32_i24_e32 v3, s13, v3
-; GFX7-NEXT:    v_mul_i32_i24_e32 v4, s12, v4
+; GFX7-NEXT:    s_bfe_i32 s15, s6, 0x40018
+; GFX7-NEXT:    s_bfe_i32 s16, s6, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s17, s6, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s18, s6, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s19, s6, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s20, s6, 0x40008
+; GFX7-NEXT:    s_ashr_i32 s14, s6, 28
+; GFX7-NEXT:    s_bfe_i32 s6, s6, 0x4000c
+; GFX7-NEXT:    s_ashr_i32 s5, s4, 28
+; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x40018
+; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x40000
+; GFX7-NEXT:    v_mov_b32_e32 v4, s18
+; GFX7-NEXT:    s_bfe_i32 s12, s4, 0x40004
+; GFX7-NEXT:    v_mov_b32_e32 v3, s19
+; GFX7-NEXT:    s_bfe_i32 s13, s4, 0x40008
+; GFX7-NEXT:    v_mov_b32_e32 v2, s20
+; GFX7-NEXT:    s_bfe_i32 s4, s4, 0x4000c
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mul_i32_i24_e32 v1, s4, v1
+; GFX7-NEXT:    v_mul_i32_i24_e32 v2, s13, v2
+; GFX7-NEXT:    v_mul_i32_i24_e32 v3, s12, v3
+; GFX7-NEXT:    v_mul_i32_i24_e32 v4, s11, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s2, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s8, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, s2, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s8, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v3
 ; GFX7-NEXT:    v_alignbit_b32 v3, v1, v2, 16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX7-NEXT:    v_mov_b32_e32 v5, s18
-; GFX7-NEXT:    v_mov_b32_e32 v6, s17
-; GFX7-NEXT:    v_mov_b32_e32 v7, s16
+; GFX7-NEXT:    v_mov_b32_e32 v5, s17
+; GFX7-NEXT:    v_mov_b32_e32 v6, s16
+; GFX7-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GFX7-NEXT:    v_mad_i32_i24 v0, s11, v5, v0
-; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v6, v0
-; GFX7-NEXT:    v_mad_i32_i24 v0, s9, v7, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s15
-; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v5, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s9, v6, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s14
+; GFX7-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot8_acc16_vecMul:
@@ -1639,59 +1642,59 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s7, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s29, s7, 28
-; GFX8-NEXT:    s_ashr_i64 s[18:19], s[6:7], 60
-; GFX8-NEXT:    s_lshl_b32 s21, s7, 8
-; GFX8-NEXT:    s_lshl_b32 s23, s7, 12
-; GFX8-NEXT:    s_lshl_b32 s17, s1, 28
-; GFX8-NEXT:    s_lshl_b32 s25, s7, 16
-; GFX8-NEXT:    s_lshl_b32 s27, s7, 24
-; GFX8-NEXT:    s_lshl_b32 s19, s7, 4
-; GFX8-NEXT:    s_lshl_b32 s7, s7, 20
+; GFX8-NEXT:    s_lshl_b32 s27, s3, 28
+; GFX8-NEXT:    s_ashr_i64 s[16:17], s[2:3], 60
+; GFX8-NEXT:    s_lshl_b32 s19, s3, 8
+; GFX8-NEXT:    s_lshl_b32 s21, s3, 12
+; GFX8-NEXT:    s_lshl_b32 s15, s1, 28
+; GFX8-NEXT:    s_lshl_b32 s23, s3, 16
+; GFX8-NEXT:    s_lshl_b32 s25, s3, 24
+; GFX8-NEXT:    s_lshl_b32 s17, s3, 4
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 20
 ; GFX8-NEXT:    s_ashr_i64 s[4:5], s[0:1], 60
-; GFX8-NEXT:    s_ashr_i64 s[28:29], s[28:29], 60
-; GFX8-NEXT:    s_lshl_b32 s9, s1, 8
-; GFX8-NEXT:    s_lshl_b32 s11, s1, 12
-; GFX8-NEXT:    s_lshl_b32 s13, s1, 16
-; GFX8-NEXT:    s_lshl_b32 s15, s1, 24
+; GFX8-NEXT:    s_ashr_i64 s[26:27], s[26:27], 60
+; GFX8-NEXT:    s_lshl_b32 s7, s1, 8
+; GFX8-NEXT:    s_lshl_b32 s9, s1, 12
+; GFX8-NEXT:    s_lshl_b32 s11, s1, 16
+; GFX8-NEXT:    s_lshl_b32 s13, s1, 24
 ; GFX8-NEXT:    s_lshl_b32 s5, s1, 4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 20
-; GFX8-NEXT:    s_ashr_i64 s[26:27], s[26:27], 60
-; GFX8-NEXT:    s_ashr_i64 s[6:7], s[6:7], 60
-; GFX8-NEXT:    s_ashr_i64 s[16:17], s[16:17], 60
-; GFX8-NEXT:    v_mov_b32_e32 v4, s28
+; GFX8-NEXT:    s_ashr_i64 s[24:25], s[24:25], 60
+; GFX8-NEXT:    s_ashr_i64 s[2:3], s[2:3], 60
 ; GFX8-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
 ; GFX8-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
-; GFX8-NEXT:    v_mov_b32_e32 v3, s6
-; GFX8-NEXT:    v_mov_b32_e32 v5, s26
-; GFX8-NEXT:    s_ashr_i64 s[24:25], s[24:25], 60
-; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s0, v3
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mov_b32_e32 v5, s24
 ; GFX8-NEXT:    s_ashr_i64 s[22:23], s[22:23], 60
-; GFX8-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
-; GFX8-NEXT:    v_mov_b32_e32 v6, s24
+; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s0, v3
 ; GFX8-NEXT:    s_ashr_i64 s[20:21], s[20:21], 60
 ; GFX8-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
-; GFX8-NEXT:    v_mov_b32_e32 v7, s22
-; GFX8-NEXT:    s_ashr_i64 s[32:33], s[18:19], 60
+; GFX8-NEXT:    v_mov_b32_e32 v6, s22
+; GFX8-NEXT:    s_ashr_i64 s[18:19], s[18:19], 60
 ; GFX8-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX8-NEXT:    v_mov_b32_e32 v8, s20
-; GFX8-NEXT:    s_ashr_i64 s[30:31], s[4:5], 60
-; GFX8-NEXT:    v_mov_b32_e32 v9, s32
+; GFX8-NEXT:    v_mov_b32_e32 v7, s20
+; GFX8-NEXT:    s_ashr_i64 s[30:31], s[16:17], 60
+; GFX8-NEXT:    s_ashr_i64 s[6:7], s[6:7], 60
+; GFX8-NEXT:    v_mov_b32_e32 v8, s18
+; GFX8-NEXT:    s_ashr_i64 s[28:29], s[4:5], 60
+; GFX8-NEXT:    v_mov_b32_e32 v9, s30
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v2, s16, v4, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s14, v5, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s14, v4, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s12, v5, v2
 ; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX8-NEXT:    v_mad_i32_i24 v2, s12, v6, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s10, v7, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v8, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s30, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    v_mad_i32_i24 v2, s10, v6, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v7, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v8, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s28, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s16
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -1704,35 +1707,35 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s5, s2, 28
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x4000c
-; GFX9-NEXT:    s_and_b32 s12, s2, 15
+; GFX9-NEXT:    s_bfe_u32 s3, s2, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
+; GFX9-NEXT:    s_and_b32 s11, s2, 15
 ; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40004
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s10, s11
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s9, s10
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s8, s9
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s5, s8
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s4, s5
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s13, s6, 28
-; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x4000c
-; GFX9-NEXT:    s_and_b32 s18, s6, 15
+; GFX9-NEXT:    s_lshr_b32 s12, s6, 28
+; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
+; GFX9-NEXT:    s_and_b32 s17, s6, 15
 ; GFX9-NEXT:    s_bfe_u32 s6, s6, 0x40004
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s18, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s17, s6
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s16, s17
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s15, s16
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s14, s15
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s13, s14
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
@@ -1746,7 +1749,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
 ; GFX9-NEXT:    global_load_ushort v6, v[0:1], off
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s7, s13
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s7, s12
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
@@ -1771,35 +1774,35 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s5, s2, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x4000c
-; GFX9-DL-NEXT:    s_and_b32 s12, s2, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s2, 0x40018
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x4000c
+; GFX9-DL-NEXT:    s_and_b32 s11, s2, 15
 ; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40004
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s10, s11
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s9, s10
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s8, s9
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s5, s8
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s4, s5
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
 ; GFX9-DL-NEXT:    s_bfe_u32 s7, s6, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s13, s6, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s17, s6, 0x4000c
-; GFX9-DL-NEXT:    s_and_b32 s18, s6, 15
+; GFX9-DL-NEXT:    s_lshr_b32 s12, s6, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s6, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x4000c
+; GFX9-DL-NEXT:    s_and_b32 s17, s6, 15
 ; GFX9-DL-NEXT:    s_bfe_u32 s6, s6, 0x40004
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s18, s6
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s17, s6
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s16, s17
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s15, s16
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s14, s15
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s13, s14
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
@@ -1813,7 +1816,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
 ; GFX9-DL-NEXT:    global_load_ushort v6, v[0:1], off
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s7, s13
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s7, s12
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
@@ -1832,53 +1835,53 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: idot8_acc16_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s5, s0, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40004
-; GFX10-DL-NEXT:    s_and_b32 s7, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s4, s0, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s6, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40004
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 28
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX10-DL-NEXT:    s_bfe_u32 s9, s0, 0x40010
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX10-DL-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s5 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 28
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s0, 0x40010
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40008
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s4 op_sel_hi:[0,1]
 ; GFX10-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s7 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x4000c
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s6 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x4000c
 ; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s6, s0
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s5, s0
 ; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40010
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s8, s5
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40010
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s4, s7, s4
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40018
 ; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
 ; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40014
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s9, s10
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
 ; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s6, s0
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s5, s0
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
 ; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1]
 ; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s2, s4
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s2, s3
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s7, s1
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s6, s1
 ; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1]
 ; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -1935,65 +1938,65 @@ entry:
 define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc8_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
-; GFX7-NEXT:    s_movk_i32 s0, 0xff
-; GFX7-NEXT:    s_mov_b32 s1, 0xffff
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_movk_i32 s8, 0xff
+; GFX7-NEXT:    s_mov_b32 s9, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GFX7-NEXT:    s_load_dword s2, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s8, s[10:11], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_i32 s9, s2, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s16, s8, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s17, s8, 0x40004
-; GFX7-NEXT:    s_bfe_i32 s18, s8, 0x40008
-; GFX7-NEXT:    s_bfe_i32 s19, s8, 0x4000c
-; GFX7-NEXT:    s_bfe_i32 s20, s8, 0x40010
-; GFX7-NEXT:    s_bfe_i32 s21, s8, 0x40014
-; GFX7-NEXT:    s_bfe_i32 s22, s8, 0x40018
-; GFX7-NEXT:    s_ashr_i32 s8, s8, 28
-; GFX7-NEXT:    v_mov_b32_e32 v8, s16
-; GFX7-NEXT:    s_bfe_i32 s10, s2, 0x40004
-; GFX7-NEXT:    v_mov_b32_e32 v7, s17
-; GFX7-NEXT:    s_bfe_i32 s11, s2, 0x40008
-; GFX7-NEXT:    v_mov_b32_e32 v6, s18
-; GFX7-NEXT:    s_bfe_i32 s12, s2, 0x4000c
-; GFX7-NEXT:    v_mov_b32_e32 v5, s19
-; GFX7-NEXT:    s_bfe_i32 s13, s2, 0x40010
-; GFX7-NEXT:    v_mov_b32_e32 v4, s20
-; GFX7-NEXT:    s_bfe_i32 s14, s2, 0x40014
-; GFX7-NEXT:    v_mov_b32_e32 v3, s21
-; GFX7-NEXT:    s_bfe_i32 s15, s2, 0x40018
-; GFX7-NEXT:    v_mov_b32_e32 v2, s22
-; GFX7-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX7-NEXT:    v_mov_b32_e32 v1, s8
-; GFX7-NEXT:    v_mul_i32_i24_e32 v1, s2, v1
-; GFX7-NEXT:    v_mul_i32_i24_e32 v2, s15, v2
-; GFX7-NEXT:    v_mul_i32_i24_e32 v3, s14, v3
-; GFX7-NEXT:    v_mul_i32_i24_e32 v9, s13, v4
-; GFX7-NEXT:    v_mul_i32_i24_e32 v5, s12, v5
-; GFX7-NEXT:    v_mul_i32_i24_e32 v6, s11, v6
-; GFX7-NEXT:    v_mul_i32_i24_e32 v7, s10, v7
-; GFX7-NEXT:    v_mul_i32_i24_e32 v8, s9, v8
+; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s15, s5, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s16, s5, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s17, s5, 0x40008
+; GFX7-NEXT:    s_bfe_i32 s18, s5, 0x4000c
+; GFX7-NEXT:    s_bfe_i32 s19, s5, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s20, s5, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s21, s5, 0x40018
+; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
+; GFX7-NEXT:    v_mov_b32_e32 v8, s15
+; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x40004
+; GFX7-NEXT:    v_mov_b32_e32 v7, s16
+; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x40008
+; GFX7-NEXT:    v_mov_b32_e32 v6, s17
+; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x4000c
+; GFX7-NEXT:    v_mov_b32_e32 v5, s18
+; GFX7-NEXT:    s_bfe_i32 s12, s4, 0x40010
+; GFX7-NEXT:    v_mov_b32_e32 v4, s19
+; GFX7-NEXT:    s_bfe_i32 s13, s4, 0x40014
+; GFX7-NEXT:    v_mov_b32_e32 v3, s20
+; GFX7-NEXT:    s_bfe_i32 s14, s4, 0x40018
+; GFX7-NEXT:    v_mov_b32_e32 v2, s21
+; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mul_i32_i24_e32 v1, s4, v1
+; GFX7-NEXT:    v_mul_i32_i24_e32 v2, s14, v2
+; GFX7-NEXT:    v_mul_i32_i24_e32 v3, s13, v3
+; GFX7-NEXT:    v_mul_i32_i24_e32 v9, s12, v4
+; GFX7-NEXT:    v_mul_i32_i24_e32 v5, s11, v5
+; GFX7-NEXT:    v_mul_i32_i24_e32 v6, s10, v6
+; GFX7-NEXT:    v_mul_i32_i24_e32 v7, s7, v7
+; GFX7-NEXT:    v_mul_i32_i24_e32 v8, s6, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s8, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX7-NEXT:    v_and_b32_e32 v9, s8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_and_b32_e32 v6, s0, v6
+; GFX7-NEXT:    v_and_b32_e32 v6, s8, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; GFX7-NEXT:    v_and_b32_e32 v8, s0, v8
+; GFX7-NEXT:    v_and_b32_e32 v8, s8, v8
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX7-NEXT:    v_or_b32_e32 v2, v9, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v6, v5
 ; GFX7-NEXT:    v_or_b32_e32 v5, v8, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s9, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v5, s1, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, s9, v5
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX7-NEXT:    v_or_b32_e32 v2, v5, v3
 ; GFX7-NEXT:    v_alignbit_b32 v3, v1, v2, 8
@@ -2007,83 +2010,83 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
-; GFX7-NEXT:    v_mad_i32_i24 v0, s13, v4, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s12, v4, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot8_acc8_vecMul:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    s_mov_b32 s33, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s13, s1, 24
-; GFX8-NEXT:    s_lshl_b32 s17, s1, 16
-; GFX8-NEXT:    s_ashr_i64 s[22:23], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s25, s5, 24
-; GFX8-NEXT:    s_lshl_b32 s27, s5, 28
-; GFX8-NEXT:    s_lshl_b32 s29, s5, 16
-; GFX8-NEXT:    s_ashr_i64 s[10:11], s[0:1], 60
-; GFX8-NEXT:    s_lshl_b32 s15, s1, 28
-; GFX8-NEXT:    s_lshl_b32 s19, s5, 8
-; GFX8-NEXT:    s_lshl_b32 s21, s5, 12
-; GFX8-NEXT:    s_lshl_b32 s23, s5, 4
-; GFX8-NEXT:    s_lshl_b32 s5, s5, 20
-; GFX8-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
-; GFX8-NEXT:    s_ashr_i64 s[16:17], s[16:17], 60
+; GFX8-NEXT:    s_lshl_b32 s11, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s15, s1, 16
+; GFX8-NEXT:    s_ashr_i64 s[20:21], s[2:3], 60
+; GFX8-NEXT:    s_lshl_b32 s23, s3, 24
+; GFX8-NEXT:    s_lshl_b32 s25, s3, 28
+; GFX8-NEXT:    s_lshl_b32 s27, s3, 16
+; GFX8-NEXT:    s_ashr_i64 s[8:9], s[0:1], 60
+; GFX8-NEXT:    s_lshl_b32 s13, s1, 28
+; GFX8-NEXT:    s_lshl_b32 s17, s3, 8
+; GFX8-NEXT:    s_lshl_b32 s19, s3, 12
+; GFX8-NEXT:    s_lshl_b32 s21, s3, 4
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 20
+; GFX8-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX8-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX8-NEXT:    s_ashr_i64 s[22:23], s[22:23], 60
 ; GFX8-NEXT:    s_ashr_i64 s[24:25], s[24:25], 60
 ; GFX8-NEXT:    s_ashr_i64 s[26:27], s[26:27], 60
-; GFX8-NEXT:    s_ashr_i64 s[28:29], s[28:29], 60
-; GFX8-NEXT:    s_lshl_b32 s7, s1, 8
-; GFX8-NEXT:    s_lshl_b32 s9, s1, 12
-; GFX8-NEXT:    s_lshl_b32 s11, s1, 4
+; GFX8-NEXT:    s_lshl_b32 s5, s1, 8
+; GFX8-NEXT:    s_lshl_b32 s7, s1, 12
+; GFX8-NEXT:    s_lshl_b32 s9, s1, 4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 20
-; GFX8-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
-; GFX8-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
-; GFX8-NEXT:    v_mov_b32_e32 v6, s28
-; GFX8-NEXT:    v_mov_b32_e32 v7, s16
-; GFX8-NEXT:    v_mov_b32_e32 v8, s26
-; GFX8-NEXT:    v_mov_b32_e32 v9, s24
-; GFX8-NEXT:    v_mov_b32_e32 v10, s12
+; GFX8-NEXT:    s_ashr_i64 s[2:3], s[2:3], 60
+; GFX8-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX8-NEXT:    v_mov_b32_e32 v6, s26
+; GFX8-NEXT:    v_mov_b32_e32 v7, s14
+; GFX8-NEXT:    v_mov_b32_e32 v8, s24
+; GFX8-NEXT:    v_mov_b32_e32 v9, s22
+; GFX8-NEXT:    v_mov_b32_e32 v10, s10
 ; GFX8-NEXT:    v_mul_i32_i24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_i32_i24_e32 v7, s14, v8
+; GFX8-NEXT:    v_mul_i32_i24_e32 v7, s12, v8
 ; GFX8-NEXT:    v_mul_i32_i24_sdwa v8, v10, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
-; GFX8-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX8-NEXT:    v_mul_i32_i24_e32 v5, s0, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    s_ashr_i64 s[6:7], s[6:7], 60
-; GFX8-NEXT:    s_ashr_i64 s[18:19], s[18:19], 60
+; GFX8-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
+; GFX8-NEXT:    s_ashr_i64 s[16:17], s[16:17], 60
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v6, s2, v7
-; GFX8-NEXT:    s_ashr_i64 s[20:21], s[20:21], 60
-; GFX8-NEXT:    v_mov_b32_e32 v3, s22
-; GFX8-NEXT:    v_mov_b32_e32 v4, s10
-; GFX8-NEXT:    s_ashr_i64 s[32:33], s[22:23], 60
+; GFX8-NEXT:    v_and_b32_e32 v6, s33, v7
+; GFX8-NEXT:    s_ashr_i64 s[18:19], s[18:19], 60
+; GFX8-NEXT:    v_mov_b32_e32 v3, s20
+; GFX8-NEXT:    v_mov_b32_e32 v4, s8
+; GFX8-NEXT:    s_ashr_i64 s[30:31], s[20:21], 60
 ; GFX8-NEXT:    v_mul_i32_i24_sdwa v3, v4, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v5, v6, v5
-; GFX8-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX8-NEXT:    v_mov_b32_e32 v4, s20
-; GFX8-NEXT:    v_mov_b32_e32 v12, s18
-; GFX8-NEXT:    v_mov_b32_e32 v13, s6
-; GFX8-NEXT:    s_ashr_i64 s[30:31], s[10:11], 60
-; GFX8-NEXT:    v_mov_b32_e32 v11, s32
-; GFX8-NEXT:    v_mul_i32_i24_e32 v4, s8, v4
+; GFX8-NEXT:    s_ashr_i64 s[6:7], s[6:7], 60
+; GFX8-NEXT:    v_mov_b32_e32 v4, s18
+; GFX8-NEXT:    v_mov_b32_e32 v12, s16
+; GFX8-NEXT:    v_mov_b32_e32 v13, s4
+; GFX8-NEXT:    s_ashr_i64 s[28:29], s[8:9], 60
+; GFX8-NEXT:    v_mov_b32_e32 v11, s30
+; GFX8-NEXT:    v_mul_i32_i24_e32 v4, s6, v4
 ; GFX8-NEXT:    v_mul_i32_i24_sdwa v10, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_mul_i32_i24_e32 v9, s30, v11
+; GFX8-NEXT:    v_mul_i32_i24_e32 v9, s28, v11
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v4, s2, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, s33, v4
 ; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -2110,20 +2113,20 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s8, s0, 4
-; GFX9-NEXT:    s_lshr_b32 s15, s1, 4
+; GFX9-NEXT:    s_lshr_b32 s7, s0, 4
+; GFX9-NEXT:    s_lshr_b32 s14, s1, 4
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v3, 12, s0
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
-; GFX9-NEXT:    v_lshlrev_b16_e64 v7, 12, s8
-; GFX9-NEXT:    v_lshlrev_b16_e64 v14, 12, s15
-; GFX9-NEXT:    s_lshr_b32 s9, s0, 12
-; GFX9-NEXT:    s_lshr_b32 s10, s0, 8
-; GFX9-NEXT:    s_lshr_b32 s16, s1, 12
-; GFX9-NEXT:    s_lshr_b32 s17, s1, 8
-; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s10
-; GFX9-NEXT:    v_lshlrev_b16_e64 v6, 12, s9
-; GFX9-NEXT:    v_lshlrev_b16_e64 v12, 12, s17
-; GFX9-NEXT:    v_lshlrev_b16_e64 v13, 12, s16
+; GFX9-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
+; GFX9-NEXT:    v_lshlrev_b16_e64 v14, 12, s14
+; GFX9-NEXT:    s_lshr_b32 s8, s0, 12
+; GFX9-NEXT:    s_lshr_b32 s9, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s15, s1, 12
+; GFX9-NEXT:    s_lshr_b32 s16, s1, 8
+; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s9
+; GFX9-NEXT:    v_lshlrev_b16_e64 v6, 12, s8
+; GFX9-NEXT:    v_lshlrev_b16_e64 v12, 12, s16
+; GFX9-NEXT:    v_lshlrev_b16_e64 v13, 12, s15
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
@@ -2135,26 +2138,26 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    s_lshr_b32 s4, s0, 20
-; GFX9-NEXT:    s_lshr_b32 s5, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s11, s1, 20
-; GFX9-NEXT:    s_lshr_b32 s12, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 20
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s10, s1, 20
+; GFX9-NEXT:    s_lshr_b32 s11, s1, 16
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
-; GFX9-NEXT:    v_lshlrev_b16_e64 v10, 12, s5
-; GFX9-NEXT:    v_lshlrev_b16_e64 v11, 12, s4
-; GFX9-NEXT:    v_lshlrev_b16_e64 v17, 12, s12
-; GFX9-NEXT:    v_lshlrev_b16_e64 v18, 12, s11
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 28
-; GFX9-NEXT:    s_lshr_b32 s7, s0, 24
-; GFX9-NEXT:    s_lshr_b32 s13, s1, 28
-; GFX9-NEXT:    s_lshr_b32 s14, s1, 24
+; GFX9-NEXT:    v_lshlrev_b16_e64 v10, 12, s4
+; GFX9-NEXT:    v_lshlrev_b16_e64 v11, 12, s3
+; GFX9-NEXT:    v_lshlrev_b16_e64 v17, 12, s11
+; GFX9-NEXT:    v_lshlrev_b16_e64 v18, 12, s10
+; GFX9-NEXT:    s_lshr_b32 s5, s0, 28
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX9-NEXT:    s_lshr_b32 s12, s1, 28
+; GFX9-NEXT:    s_lshr_b32 s13, s1, 24
 ; GFX9-NEXT:    v_and_b32_e32 v3, s2, v3
 ; GFX9-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e64 v8, 12, s7
-; GFX9-NEXT:    v_lshlrev_b16_e64 v9, 12, s6
-; GFX9-NEXT:    v_lshlrev_b16_e64 v15, 12, s14
-; GFX9-NEXT:    v_lshlrev_b16_e64 v16, 12, s13
+; GFX9-NEXT:    v_lshlrev_b16_e64 v8, 12, s6
+; GFX9-NEXT:    v_lshlrev_b16_e64 v9, 12, s5
+; GFX9-NEXT:    v_lshlrev_b16_e64 v15, 12, s13
+; GFX9-NEXT:    v_lshlrev_b16_e64 v16, 12, s12
 ; GFX9-NEXT:    v_or_b32_e32 v5, v3, v5
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
@@ -2198,20 +2201,20 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_lshr_b32 s8, s0, 4
-; GFX9-DL-NEXT:    s_lshr_b32 s15, s1, 4
+; GFX9-DL-NEXT:    s_lshr_b32 s7, s0, 4
+; GFX9-DL-NEXT:    s_lshr_b32 s14, s1, 4
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s0
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s8
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s15
-; GFX9-DL-NEXT:    s_lshr_b32 s9, s0, 12
-; GFX9-DL-NEXT:    s_lshr_b32 s10, s0, 8
-; GFX9-DL-NEXT:    s_lshr_b32 s16, s1, 12
-; GFX9-DL-NEXT:    s_lshr_b32 s17, s1, 8
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s10
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s9
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s17
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s16
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s14
+; GFX9-DL-NEXT:    s_lshr_b32 s8, s0, 12
+; GFX9-DL-NEXT:    s_lshr_b32 s9, s0, 8
+; GFX9-DL-NEXT:    s_lshr_b32 s15, s1, 12
+; GFX9-DL-NEXT:    s_lshr_b32 s16, s1, 8
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s9
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s8
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s16
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s15
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
@@ -2223,26 +2226,26 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, v3, v4
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s0, 20
-; GFX9-DL-NEXT:    s_lshr_b32 s5, s0, 16
-; GFX9-DL-NEXT:    s_lshr_b32 s11, s1, 20
-; GFX9-DL-NEXT:    s_lshr_b32 s12, s1, 16
+; GFX9-DL-NEXT:    s_lshr_b32 s3, s0, 20
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX9-DL-NEXT:    s_lshr_b32 s10, s1, 20
+; GFX9-DL-NEXT:    s_lshr_b32 s11, s1, 16
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s5
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s4
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v17, 12, s12
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v18, 12, s11
-; GFX9-DL-NEXT:    s_lshr_b32 s6, s0, 28
-; GFX9-DL-NEXT:    s_lshr_b32 s7, s0, 24
-; GFX9-DL-NEXT:    s_lshr_b32 s13, s1, 28
-; GFX9-DL-NEXT:    s_lshr_b32 s14, s1, 24
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s4
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s3
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v17, 12, s11
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v18, 12, s10
+; GFX9-DL-NEXT:    s_lshr_b32 s5, s0, 28
+; GFX9-DL-NEXT:    s_lshr_b32 s6, s0, 24
+; GFX9-DL-NEXT:    s_lshr_b32 s12, s1, 28
+; GFX9-DL-NEXT:    s_lshr_b32 s13, s1, 24
 ; GFX9-DL-NEXT:    v_and_b32_e32 v3, s2, v3
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s7
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s6
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s14
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v16, 12, s13
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s6
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s5
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s13
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v16, 12, s12
 ; GFX9-DL-NEXT:    v_or_b32_e32 v5, v3, v5
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
@@ -2276,70 +2279,70 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: idot8_acc8_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s8, s0, 4
-; GFX10-DL-NEXT:    s_lshr_b32 s15, s1, 4
-; GFX10-DL-NEXT:    s_lshr_b32 s9, s0, 12
-; GFX10-DL-NEXT:    s_lshr_b32 s16, s1, 12
+; GFX10-DL-NEXT:    s_lshr_b32 s7, s0, 4
+; GFX10-DL-NEXT:    s_lshr_b32 s14, s1, 4
+; GFX10-DL-NEXT:    s_lshr_b32 s8, s0, 12
+; GFX10-DL-NEXT:    s_lshr_b32 s15, s1, 12
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s0
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s8
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s15
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s14
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s16
-; GFX10-DL-NEXT:    s_lshr_b32 s10, s0, 8
-; GFX10-DL-NEXT:    s_lshr_b32 s17, s1, 8
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s9
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s15
+; GFX10-DL-NEXT:    s_lshr_b32 s9, s0, 8
+; GFX10-DL-NEXT:    s_lshr_b32 s16, s1, 8
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s8
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 12, v7
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v12
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s10
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s9
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s17
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s16
 ; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, v7, v12
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v19, 12, v6
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v14, 12, v14
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 20
-; GFX10-DL-NEXT:    s_lshr_b32 s5, s0, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s6, s0, 28
-; GFX10-DL-NEXT:    s_lshr_b32 s7, s0, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 20
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s5, s0, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s0, 24
 ; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, v3, v4
 ; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v19, v14
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 8, v7
-; GFX10-DL-NEXT:    s_lshr_b32 s11, s1, 20
+; GFX10-DL-NEXT:    s_lshr_b32 s10, s1, 20
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v13
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v5
-; GFX10-DL-NEXT:    s_lshr_b32 s12, s1, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s11, s1, 16
 ; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    s_lshr_b32 s13, s1, 28
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s7
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s6
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s5
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s4
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s11
+; GFX10-DL-NEXT:    s_lshr_b32 s12, s1, 28
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s6
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s5
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s4
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s3
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s10
 ; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, v5, v12
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 8, v4
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s12
-; GFX10-DL-NEXT:    s_lshr_b32 s14, s1, 24
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s11
+; GFX10-DL-NEXT:    s_lshr_b32 s13, s1, 24
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 12, v8
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v8, 12, v9
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v9, 12, v10
 ; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v16, 12, s13
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v16, 12, s12
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v11
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v10, 12, v13
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s14
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s13
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 12, v7
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v16
 ; GFX10-DL-NEXT:    v_or_b32_e32 v4, v3, v4

diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 543b55e8e261..21eede8df373 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -9,49 +9,49 @@
 define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s10, s[10:11], 0x0
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s11, s10, 28
-; GFX7-NEXT:    s_bfe_u32 s15, s10, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s16, s10, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s17, s10, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s18, s10, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s19, s10, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s20, s10, 0x40004
-; GFX7-NEXT:    s_and_b32 s10, s10, 15
-; GFX7-NEXT:    s_lshr_b32 s1, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s14, s0, 0x40004
-; GFX7-NEXT:    s_and_b32 s0, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v0, s10
-; GFX7-NEXT:    v_mov_b32_e32 v1, s21
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v0, v1
+; GFX7-NEXT:    s_lshr_b32 s7, s6, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s6, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s6, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s6, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s6, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s6, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s19, s6, 0x40004
+; GFX7-NEXT:    s_and_b32 s6, s6, 15
+; GFX7-NEXT:    s_lshr_b32 s5, s4, 28
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s13, s4, 0x40004
+; GFX7-NEXT:    s_and_b32 s4, s4, 15
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s20
-; GFX7-NEXT:    v_mad_u32_u24 v0, s14, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s19
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s17
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s16
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s15
-; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s11
-; GFX7-NEXT:    v_mad_u32_u24 v0, s1, v1, v0
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s14
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc32:
@@ -61,29 +61,27 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
-; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s12, s6, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40004
 ; GFX8-NEXT:    s_and_b32 s6, s6, 15
-; GFX8-NEXT:    s_lshr_b32 s4, s2, 28
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s3, s2, 28
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40004
 ; GFX8-NEXT:    s_and_b32 s2, s2, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s19
-; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s18
-; GFX8-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
+; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX8-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s16
@@ -94,8 +92,10 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s12
+; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -108,29 +108,27 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
-; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s12, s6, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40004
 ; GFX9-NEXT:    s_and_b32 s6, s6, 15
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
-; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s12, s2, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40004
 ; GFX9-NEXT:    s_and_b32 s2, s2, 15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s19
-; GFX9-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
+; GFX9-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX9-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s16
@@ -141,8 +139,10 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX9-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s12
+; GFX9-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
+; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -154,11 +154,11 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
@@ -167,18 +167,18 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot8_acc32:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s1, s2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s0, s1, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
@@ -254,49 +254,49 @@ entry:
 define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s14, s1, 28
-; GFX7-NEXT:    s_and_b32 s1, s1, 15
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x40004
-; GFX7-NEXT:    s_and_b32 s0, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s20
-; GFX7-NEXT:    v_mov_b32_e32 v3, s19
-; GFX7-NEXT:    v_mov_b32_e32 v4, s18
-; GFX7-NEXT:    v_mov_b32_e32 v5, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s16
-; GFX7-NEXT:    v_mov_b32_e32 v7, s15
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
+; GFX7-NEXT:    s_and_b32 s5, s5, 15
+; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
+; GFX7-NEXT:    s_and_b32 s4, s4, 15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s19
+; GFX7-NEXT:    v_mov_b32_e32 v3, s18
+; GFX7-NEXT:    v_mov_b32_e32 v4, s17
+; GFX7-NEXT:    v_mov_b32_e32 v5, s16
+; GFX7-NEXT:    v_mov_b32_e32 v6, s15
+; GFX7-NEXT:    v_mov_b32_e32 v7, s14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v2, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v3, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v4, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v5, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v7, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s14
-; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s13
+; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc16:
@@ -311,38 +311,38 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s16, s1, 0x40004
-; GFX8-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX8-NEXT:    s_and_b32 s1, s1, 15
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
 ; GFX8-NEXT:    s_and_b32 s0, s0, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s16
-; GFX8-NEXT:    v_mov_b32_e32 v5, s15
-; GFX8-NEXT:    v_mov_b32_e32 v6, s14
-; GFX8-NEXT:    v_mov_b32_e32 v7, s13
-; GFX8-NEXT:    v_mov_b32_e32 v8, s12
-; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    v_mov_b32_e32 v4, s15
+; GFX8-NEXT:    v_mov_b32_e32 v5, s14
+; GFX8-NEXT:    v_mov_b32_e32 v6, s13
+; GFX8-NEXT:    v_mov_b32_e32 v7, s12
+; GFX8-NEXT:    v_mov_b32_e32 v8, s11
+; GFX8-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -359,38 +359,38 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s16, s1, 0x40004
-; GFX9-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX9-NEXT:    s_and_b32 s1, s1, 15
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40004
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s16
-; GFX9-NEXT:    v_mov_b32_e32 v5, s15
-; GFX9-NEXT:    v_mov_b32_e32 v6, s14
-; GFX9-NEXT:    v_mov_b32_e32 v7, s13
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-NEXT:    v_mov_b32_e32 v5, s14
+; GFX9-NEXT:    v_mov_b32_e32 v6, s13
+; GFX9-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -407,80 +407,80 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s16, s1, 0x40004
-; GFX9-DL-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX9-DL-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX9-DL-NEXT:    s_and_b32 s1, s1, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40004
 ; GFX9-DL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s14
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc16:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
-; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40004
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40008
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40008
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x4000c
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
@@ -557,49 +557,49 @@ entry:
 define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc8:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s14, s1, 28
-; GFX7-NEXT:    s_and_b32 s1, s1, 15
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x40004
-; GFX7-NEXT:    s_and_b32 s0, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s20
-; GFX7-NEXT:    v_mov_b32_e32 v3, s19
-; GFX7-NEXT:    v_mov_b32_e32 v4, s18
-; GFX7-NEXT:    v_mov_b32_e32 v5, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s16
-; GFX7-NEXT:    v_mov_b32_e32 v7, s15
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
+; GFX7-NEXT:    s_and_b32 s5, s5, 15
+; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
+; GFX7-NEXT:    s_and_b32 s4, s4, 15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s19
+; GFX7-NEXT:    v_mov_b32_e32 v3, s18
+; GFX7-NEXT:    v_mov_b32_e32 v4, s17
+; GFX7-NEXT:    v_mov_b32_e32 v5, s16
+; GFX7-NEXT:    v_mov_b32_e32 v6, s15
+; GFX7-NEXT:    v_mov_b32_e32 v7, s14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v2, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v3, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v4, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v5, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v7, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s14
-; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s13
+; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc8:
@@ -614,38 +614,38 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s16, s1, 0x40004
-; GFX8-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX8-NEXT:    s_and_b32 s1, s1, 15
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
 ; GFX8-NEXT:    s_and_b32 s0, s0, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s16
-; GFX8-NEXT:    v_mov_b32_e32 v5, s15
-; GFX8-NEXT:    v_mov_b32_e32 v6, s14
-; GFX8-NEXT:    v_mov_b32_e32 v7, s13
-; GFX8-NEXT:    v_mov_b32_e32 v8, s12
-; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    v_mov_b32_e32 v4, s15
+; GFX8-NEXT:    v_mov_b32_e32 v5, s14
+; GFX8-NEXT:    v_mov_b32_e32 v6, s13
+; GFX8-NEXT:    v_mov_b32_e32 v7, s12
+; GFX8-NEXT:    v_mov_b32_e32 v8, s11
+; GFX8-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -662,38 +662,38 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s16, s1, 0x40004
-; GFX9-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX9-NEXT:    s_and_b32 s1, s1, 15
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40004
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s16
-; GFX9-NEXT:    v_mov_b32_e32 v5, s15
-; GFX9-NEXT:    v_mov_b32_e32 v6, s14
-; GFX9-NEXT:    v_mov_b32_e32 v7, s13
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-NEXT:    v_mov_b32_e32 v5, s14
+; GFX9-NEXT:    v_mov_b32_e32 v6, s13
+; GFX9-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -710,80 +710,80 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s16, s1, 0x40004
-; GFX9-DL-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX9-DL-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX9-DL-NEXT:    s_and_b32 s1, s1, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40004
 ; GFX9-DL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s14
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc8:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
-; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40004
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40008
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40008
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x4000c
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
@@ -860,50 +860,50 @@ entry:
 define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc4:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s14, s1, 28
-; GFX7-NEXT:    s_and_b32 s1, s1, 15
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x40004
-; GFX7-NEXT:    s_and_b32 s0, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s20
-; GFX7-NEXT:    v_mov_b32_e32 v3, s19
-; GFX7-NEXT:    v_mov_b32_e32 v4, s18
-; GFX7-NEXT:    v_mov_b32_e32 v5, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s16
-; GFX7-NEXT:    v_mov_b32_e32 v7, s15
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
+; GFX7-NEXT:    s_and_b32 s5, s5, 15
+; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
+; GFX7-NEXT:    s_and_b32 s4, s4, 15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s19
+; GFX7-NEXT:    v_mov_b32_e32 v3, s18
+; GFX7-NEXT:    v_mov_b32_e32 v4, s17
+; GFX7-NEXT:    v_mov_b32_e32 v5, s16
+; GFX7-NEXT:    v_mov_b32_e32 v6, s15
+; GFX7-NEXT:    v_mov_b32_e32 v7, s14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v2, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v3, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v4, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v5, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v7, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s14
-; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s13
+; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc4:
@@ -917,41 +917,41 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s9, s0, 15
-; GFX8-NEXT:    s_and_b32 s16, s1, 15
-; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v4, s16
-; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX8-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX8-NEXT:    s_and_b32 s8, s0, 15
+; GFX8-NEXT:    s_and_b32 s15, s1, 15
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v4, s15
+; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40008
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40008
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v6, s14
+; GFX8-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX8-NEXT:    v_mov_b32_e32 v7, s13
-; GFX8-NEXT:    v_mov_b32_e32 v8, s12
-; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    v_mov_b32_e32 v7, s12
+; GFX8-NEXT:    v_mov_b32_e32 v8, s11
+; GFX8-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
@@ -968,41 +968,41 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s9, s0, 15
-; GFX9-NEXT:    s_and_b32 s16, s1, 15
-; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v4, s16
-; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX9-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-NEXT:    s_and_b32 s8, s0, 15
+; GFX9-NEXT:    s_and_b32 s15, s1, 15
+; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40008
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
 ; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, s13
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
@@ -1019,41 +1019,41 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s9, s0, 15
-; GFX9-DL-NEXT:    s_and_b32 s16, s1, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX9-DL-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT:    s_and_b32 s8, s0, 15
+; GFX9-DL-NEXT:    s_and_b32 s15, s1, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40008
+; GFX9-DL-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
 ; GFX9-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
 ; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
@@ -1061,44 +1061,44 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot8_acc4:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
-; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40008
+; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40008
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x4000c
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s7, v2
-; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s4, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x4000c
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s6, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s3, s4
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
@@ -1160,50 +1160,50 @@ entry:
 define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_CommutationInsideMAD:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s14, s1, 28
-; GFX7-NEXT:    s_and_b32 s1, s1, 15
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x40004
-; GFX7-NEXT:    s_and_b32 s0, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s20
-; GFX7-NEXT:    v_mov_b32_e32 v3, s19
-; GFX7-NEXT:    v_mov_b32_e32 v4, s18
-; GFX7-NEXT:    v_mov_b32_e32 v5, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s16
-; GFX7-NEXT:    v_mov_b32_e32 v7, s15
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
+; GFX7-NEXT:    s_and_b32 s5, s5, 15
+; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
+; GFX7-NEXT:    s_and_b32 s4, s4, 15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s19
+; GFX7-NEXT:    v_mov_b32_e32 v3, s18
+; GFX7-NEXT:    v_mov_b32_e32 v4, s17
+; GFX7-NEXT:    v_mov_b32_e32 v5, s16
+; GFX7-NEXT:    v_mov_b32_e32 v6, s15
+; GFX7-NEXT:    v_mov_b32_e32 v7, s14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v2, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v3, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v4, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v5, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v7, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s14
-; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s13
+; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_CommutationInsideMAD:
@@ -1217,41 +1217,41 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s9, s0, 15
-; GFX8-NEXT:    s_and_b32 s16, s1, 15
-; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v4, s16
-; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX8-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX8-NEXT:    s_and_b32 s8, s0, 15
+; GFX8-NEXT:    s_and_b32 s15, s1, 15
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v4, s15
+; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40008
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40008
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v6, s14
+; GFX8-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX8-NEXT:    v_mov_b32_e32 v7, s13
-; GFX8-NEXT:    v_mov_b32_e32 v8, s12
-; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    v_mov_b32_e32 v7, s12
+; GFX8-NEXT:    v_mov_b32_e32 v8, s11
+; GFX8-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
@@ -1268,41 +1268,41 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s9, s0, 15
-; GFX9-NEXT:    s_and_b32 s16, s1, 15
-; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v4, s16
-; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX9-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-NEXT:    s_and_b32 s8, s0, 15
+; GFX9-NEXT:    s_and_b32 s15, s1, 15
+; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40008
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
 ; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, s13
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
@@ -1319,41 +1319,41 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s9, s0, 15
-; GFX9-DL-NEXT:    s_and_b32 s16, s1, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX9-DL-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT:    s_and_b32 s8, s0, 15
+; GFX9-DL-NEXT:    s_and_b32 s15, s1, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40008
+; GFX9-DL-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
 ; GFX9-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
 ; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
@@ -1361,44 +1361,44 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ;
 ; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
-; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x4000c
+; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x4000c
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x4000c
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s4, s8
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s7, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s3, s7
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s6, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
@@ -1458,51 +1458,51 @@ entry:
 define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_multiuses_mul1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s10, s[10:11], 0x0
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s20, s10, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s11, s10, 28
-; GFX7-NEXT:    s_bfe_u32 s15, s10, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s16, s10, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s17, s10, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s18, s10, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s19, s10, 0x40008
-; GFX7-NEXT:    s_and_b32 s10, s10, 15
-; GFX7-NEXT:    s_lshr_b32 s1, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s14, s0, 0x40004
-; GFX7-NEXT:    s_and_b32 s0, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v0, s10
-; GFX7-NEXT:    v_mov_b32_e32 v1, s21
-; GFX7-NEXT:    v_mad_u32_u24 v1, s0, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s20
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v0, v1
-; GFX7-NEXT:    v_mad_u32_u24 v1, s14, v2, v1
+; GFX7-NEXT:    s_bfe_u32 s19, s6, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s7, s6, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s6, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s6, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s6, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s6, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s6, 0x40008
+; GFX7-NEXT:    s_and_b32 s6, s6, 15
+; GFX7-NEXT:    s_lshr_b32 s5, s4, 28
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s13, s4, 0x40004
+; GFX7-NEXT:    s_and_b32 s4, s4, 15
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s20
+; GFX7-NEXT:    v_mad_u32_u24 v1, s4, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s19
+; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v0, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v1, s13, v2, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX7-NEXT:    v_mad_u32_u24 v1, s12, v2, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s17
-; GFX7-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s16
-; GFX7-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s15
-; GFX7-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s11
-; GFX7-NEXT:    v_mad_u32_u24 v1, s1, v2, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s14
+; GFX7-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_multiuses_mul1:
@@ -1512,31 +1512,29 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40004
 ; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
-; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s12, s6, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x40008
 ; GFX8-NEXT:    s_and_b32 s6, s6, 15
-; GFX8-NEXT:    s_lshr_b32 s4, s2, 28
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s3, s2, 28
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40004
 ; GFX8-NEXT:    s_and_b32 s2, s2, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s19
+; GFX8-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX8-NEXT:    v_mad_u32_u24 v1, s2, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s18
-; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
-; GFX8-NEXT:    v_mad_u32_u24 v1, s12, v2, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
 ; GFX8-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX8-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
@@ -1546,8 +1544,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s13
 ; GFX8-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX8-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -1561,31 +1561,29 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40004
 ; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
-; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s12, s6, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x40008
 ; GFX9-NEXT:    s_and_b32 s6, s6, 15
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
-; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s12, s2, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40004
 ; GFX9-NEXT:    s_and_b32 s2, s2, 15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s18
-; GFX9-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
-; GFX9-NEXT:    v_mad_u32_u24 v1, s12, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
 ; GFX9-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
@@ -1595,8 +1593,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s13
 ; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -1610,31 +1610,29 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s19, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX9-DL-NEXT:    s_bfe_u32 s17, s6, 0x40004
 ; GFX9-DL-NEXT:    s_lshr_b32 s7, s6, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s6, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s17, s6, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s6, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s6, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x40008
 ; GFX9-DL-NEXT:    s_and_b32 s6, s6, 15
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s2, 0x40004
+; GFX9-DL-NEXT:    s_lshr_b32 s3, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40004
 ; GFX9-DL-NEXT:    s_and_b32 s2, s2, 15
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s18
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s12, v2, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
@@ -1644,8 +1642,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s13
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1659,35 +1659,35 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s6, s2, 15
-; GFX10-DL-NEXT:    s_and_b32 s7, s4, 15
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x40008
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s7, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s5, s8, v0
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x4000c
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s7, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s9, s10, v1
-; GFX10-DL-NEXT:    s_bfe_u32 s9, s2, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x40010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s5, s8, v1
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s9, s10, v1
-; GFX10-DL-NEXT:    s_bfe_u32 s9, s2, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x40018
+; GFX10-DL-NEXT:    s_and_b32 s5, s2, 15
+; GFX10-DL-NEXT:    s_and_b32 s6, s3, 15
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s3, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s3, 0x40008
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s5, s6, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s4, s7, v0
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s2, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s3, 0x4000c
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s5, s6, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s8, s9, v1
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s3, 0x40010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s4, s7, v1
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s3, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s8, s9, v1
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s3, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s5, s8, v1
-; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s9, s10, v1
-; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s4, v1
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s4, s7, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s8, s9, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v0, v1
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1766,49 +1766,49 @@ entry:
 define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc32_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s10, s[10:11], 0x0
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s11, s10, 28
-; GFX7-NEXT:    s_bfe_u32 s15, s10, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s16, s10, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s17, s10, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s18, s10, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s19, s10, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s20, s10, 0x40004
-; GFX7-NEXT:    s_and_b32 s10, s10, 15
-; GFX7-NEXT:    s_lshr_b32 s1, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s14, s0, 0x40004
-; GFX7-NEXT:    s_and_b32 s0, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v0, s10
-; GFX7-NEXT:    v_mov_b32_e32 v1, s21
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v0, v1
+; GFX7-NEXT:    s_lshr_b32 s7, s6, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s6, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s6, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s6, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s6, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s6, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s19, s6, 0x40004
+; GFX7-NEXT:    s_and_b32 s6, s6, 15
+; GFX7-NEXT:    s_lshr_b32 s5, s4, 28
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s13, s4, 0x40004
+; GFX7-NEXT:    s_and_b32 s4, s4, 15
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s20
-; GFX7-NEXT:    v_mad_u32_u24 v0, s14, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s19
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s17
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s16
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s15
-; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s11
-; GFX7-NEXT:    v_mad_u32_u24 v0, s1, v1, v0
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s14
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc32_vecMul:
@@ -1818,29 +1818,27 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
-; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s12, s6, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40004
 ; GFX8-NEXT:    s_and_b32 s6, s6, 15
-; GFX8-NEXT:    s_lshr_b32 s4, s2, 28
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s3, s2, 28
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40004
 ; GFX8-NEXT:    s_and_b32 s2, s2, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s19
-; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s18
-; GFX8-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
+; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX8-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s16
@@ -1851,8 +1849,10 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s12
+; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -1865,29 +1865,27 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
-; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s12, s6, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40004
 ; GFX9-NEXT:    s_and_b32 s6, s6, 15
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
-; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s12, s2, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40004
 ; GFX9-NEXT:    s_and_b32 s2, s2, 15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s19
-; GFX9-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
+; GFX9-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX9-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s16
@@ -1898,8 +1896,10 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX9-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s12
+; GFX9-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
+; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -1911,11 +1911,11 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1924,18 +1924,18 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot8_acc32_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s1, s2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s0, s1, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
@@ -1976,59 +1976,59 @@ entry:
 define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc16_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x40004
-; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x40004
-; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x4000c
-; GFX7-NEXT:    v_mov_b32_e32 v4, s18
-; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40010
-; GFX7-NEXT:    s_and_b32 s19, s1, 15
-; GFX7-NEXT:    s_lshr_b32 s14, s1, 28
-; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x4000c
-; GFX7-NEXT:    v_mov_b32_e32 v2, s20
-; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s13, v2
-; GFX7-NEXT:    v_mul_u32_u24_e32 v4, s11, v4
-; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40010
-; GFX7-NEXT:    s_and_b32 s12, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v3, s19
-; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x40008
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mul_u32_u24_e32 v1, s0, v1
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x40004
+; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x40004
+; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x4000c
+; GFX7-NEXT:    v_mov_b32_e32 v4, s17
+; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
+; GFX7-NEXT:    s_and_b32 s18, s5, 15
+; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
+; GFX7-NEXT:    s_bfe_u32 s5, s5, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x4000c
+; GFX7-NEXT:    v_mov_b32_e32 v2, s19
+; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s12, v2
+; GFX7-NEXT:    v_mul_u32_u24_e32 v4, s10, v4
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
+; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX7-NEXT:    s_and_b32 s11, s4, 15
+; GFX7-NEXT:    v_mov_b32_e32 v3, s18
+; GFX7-NEXT:    s_bfe_u32 s4, s4, 0x40008
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mul_u32_u24_e32 v1, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_u32_u24_e32 v3, s12, v3
+; GFX7-NEXT:    v_mul_u32_u24_e32 v3, s11, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v4
 ; GFX7-NEXT:    v_alignbit_b32 v3, v1, v2, 16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX7-NEXT:    v_mov_b32_e32 v5, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s16
-; GFX7-NEXT:    v_mov_b32_e32 v7, s15
+; GFX7-NEXT:    v_mov_b32_e32 v5, s16
+; GFX7-NEXT:    v_mov_b32_e32 v6, s15
+; GFX7-NEXT:    v_mov_b32_e32 v7, s14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v5, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v7, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s14
-; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s13
+; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc16_vecMul:
@@ -2043,38 +2043,38 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s16, s1, 0x40004
-; GFX8-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX8-NEXT:    s_and_b32 s1, s1, 15
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
 ; GFX8-NEXT:    s_and_b32 s0, s0, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s16
-; GFX8-NEXT:    v_mov_b32_e32 v5, s15
-; GFX8-NEXT:    v_mov_b32_e32 v6, s14
-; GFX8-NEXT:    v_mov_b32_e32 v7, s13
-; GFX8-NEXT:    v_mov_b32_e32 v8, s12
-; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    v_mov_b32_e32 v4, s15
+; GFX8-NEXT:    v_mov_b32_e32 v5, s14
+; GFX8-NEXT:    v_mov_b32_e32 v6, s13
+; GFX8-NEXT:    v_mov_b32_e32 v7, s12
+; GFX8-NEXT:    v_mov_b32_e32 v8, s11
+; GFX8-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -2088,36 +2088,36 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s13, s6, 28
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s13
-; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s5, s2, 28
-; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40014
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-NEXT:    s_lshr_b32 s12, s6, 28
+; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s12
+; GFX9-NEXT:    s_bfe_u32 s3, s2, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    v_pk_mul_lo_u16 v2, s4, v0
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s14, s15
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x4000c
-; GFX9-NEXT:    s_and_b32 s18, s6, 15
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s8, s9
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x4000c
+; GFX9-NEXT:    v_pk_mul_lo_u16 v2, s3, v0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s13, s14
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
+; GFX9-NEXT:    s_and_b32 s17, s6, 15
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s8
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
 ; GFX9-NEXT:    s_bfe_u32 s6, s6, 0x40004
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s16, s17
-; GFX9-NEXT:    v_pk_mul_lo_u16 v3, s5, v0
-; GFX9-NEXT:    s_and_b32 s12, s2, 15
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s15, s16
+; GFX9-NEXT:    v_pk_mul_lo_u16 v3, s4, v0
+; GFX9-NEXT:    s_and_b32 s11, s2, 15
 ; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s10, s11
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s18, s6
-; GFX9-NEXT:    v_pk_mul_lo_u16 v4, s5, v0
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s9, s10
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s17, s6
+; GFX9-NEXT:    v_pk_mul_lo_u16 v4, s4, v0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v5, s2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -2143,36 +2143,36 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_bfe_u32 s7, s6, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s13, s6, 28
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s7, s7, s13
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s5, s2, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x40014
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-DL-NEXT:    s_lshr_b32 s12, s6, 28
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s7, s7, s12
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s2, 0x40018
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s6, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40014
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, s4, v0
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s14, s15
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s17, s6, 0x4000c
-; GFX9-DL-NEXT:    s_and_b32 s18, s6, 15
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s5, s8, s9
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x4000c
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, s3, v0
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s13, s14
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x4000c
+; GFX9-DL-NEXT:    s_and_b32 s17, s6, 15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s5, s8
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x4000c
 ; GFX9-DL-NEXT:    s_bfe_u32 s6, s6, 0x40004
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s16, s17
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, s5, v0
-; GFX9-DL-NEXT:    s_and_b32 s12, s2, 15
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s15, s16
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, s4, v0
+; GFX9-DL-NEXT:    s_and_b32 s11, s2, 15
 ; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s5, s10, s11
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s18, s6
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, s5, v0
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s9, s10
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s17, s6
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, s4, v0
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, s2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
@@ -2191,47 +2191,47 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot8_acc16_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40004
-; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s0, 0x4000c
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40008
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40008
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s2, s4
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s4, s6, s7
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s5, s8
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s0, 0x4000c
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40008
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40008
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s2, s3
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s4, s4, s7
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40014
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, s5, s4
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40014
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, s4, s3
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s7, s8
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40018
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s4, s6, s7
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s4, s0
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s6, s1
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s3, s0
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s5, s1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s2, s5
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s2, s4
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, s0, s1
@@ -2278,53 +2278,53 @@ entry:
 define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc8_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s14, s1, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s18, s1, 28
-; GFX7-NEXT:    v_mov_b32_e32 v8, s14
-; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40008
-; GFX7-NEXT:    s_and_b32 s17, s1, 15
-; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40004
-; GFX7-NEXT:    v_mov_b32_e32 v6, s16
-; GFX7-NEXT:    s_lshr_b32 s11, s0, 28
-; GFX7-NEXT:    v_mov_b32_e32 v4, s18
-; GFX7-NEXT:    v_mul_u32_u24_e32 v4, s11, v4
-; GFX7-NEXT:    v_mul_u32_u24_e32 v6, s9, v6
-; GFX7-NEXT:    v_mul_u32_u24_e32 v8, s2, v8
-; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40008
-; GFX7-NEXT:    v_mov_b32_e32 v7, s15
-; GFX7-NEXT:    s_and_b32 s10, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v5, s17
-; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x40018
-; GFX7-NEXT:    v_mov_b32_e32 v3, s19
-; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x40014
-; GFX7-NEXT:    v_mov_b32_e32 v2, s20
-; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s13, v2
-; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x40010
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mul_u32_u24_e32 v3, s12, v3
+; GFX7-NEXT:    s_bfe_u32 s6, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s13, s5, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s17, s5, 28
+; GFX7-NEXT:    v_mov_b32_e32 v8, s13
+; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40008
+; GFX7-NEXT:    s_and_b32 s16, s5, 15
+; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40004
+; GFX7-NEXT:    v_mov_b32_e32 v6, s15
+; GFX7-NEXT:    s_lshr_b32 s10, s4, 28
+; GFX7-NEXT:    v_mov_b32_e32 v4, s17
+; GFX7-NEXT:    v_mul_u32_u24_e32 v4, s10, v4
+; GFX7-NEXT:    v_mul_u32_u24_e32 v6, s8, v6
+; GFX7-NEXT:    v_mul_u32_u24_e32 v8, s6, v8
+; GFX7-NEXT:    s_bfe_u32 s5, s5, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40008
+; GFX7-NEXT:    v_mov_b32_e32 v7, s14
+; GFX7-NEXT:    s_and_b32 s9, s4, 15
+; GFX7-NEXT:    v_mov_b32_e32 v5, s16
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40018
+; GFX7-NEXT:    v_mov_b32_e32 v3, s18
+; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40014
+; GFX7-NEXT:    v_mov_b32_e32 v2, s19
+; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s12, v2
+; GFX7-NEXT:    s_bfe_u32 s4, s4, 0x40010
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mul_u32_u24_e32 v3, s11, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX7-NEXT:    v_mul_u32_u24_e32 v5, s10, v5
-; GFX7-NEXT:    v_mul_u32_u24_e32 v7, s8, v7
+; GFX7-NEXT:    v_mul_u32_u24_e32 v5, s9, v5
+; GFX7-NEXT:    v_mul_u32_u24_e32 v7, s7, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    v_or_b32_e32 v4, v5, v6
 ; GFX7-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX7-NEXT:    v_mul_u32_u24_e32 v9, s0, v1
+; GFX7-NEXT:    v_mul_u32_u24_e32 v9, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v9, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -2342,11 +2342,11 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc8_vecMul:
@@ -2361,42 +2361,42 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s8, s1, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s15, s2, 0x40004
-; GFX8-NEXT:    s_and_b32 s16, s2, 15
-; GFX8-NEXT:    s_bfe_u32 s17, s2, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x40014
-; GFX8-NEXT:    s_lshr_b32 s6, s1, 28
-; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x40010
-; GFX8-NEXT:    s_lshr_b32 s13, s2, 28
-; GFX8-NEXT:    s_bfe_u32 s14, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s7, s1, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s9, s1, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s14, s2, 0x40004
+; GFX8-NEXT:    s_and_b32 s15, s2, 15
+; GFX8-NEXT:    s_bfe_u32 s16, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s3, s1, 0x40014
+; GFX8-NEXT:    s_lshr_b32 s5, s1, 28
+; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40010
+; GFX8-NEXT:    s_lshr_b32 s12, s2, 28
+; GFX8-NEXT:    s_bfe_u32 s13, s2, 0x40018
 ; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x40008
-; GFX8-NEXT:    s_and_b32 s9, s1, 15
-; GFX8-NEXT:    v_mov_b32_e32 v4, s17
-; GFX8-NEXT:    v_mov_b32_e32 v5, s10
-; GFX8-NEXT:    v_mov_b32_e32 v6, s16
-; GFX8-NEXT:    v_mov_b32_e32 v7, s15
-; GFX8-NEXT:    v_mov_b32_e32 v8, s8
+; GFX8-NEXT:    s_and_b32 s8, s1, 15
+; GFX8-NEXT:    v_mov_b32_e32 v4, s16
+; GFX8-NEXT:    v_mov_b32_e32 v5, s9
+; GFX8-NEXT:    v_mov_b32_e32 v6, s15
+; GFX8-NEXT:    v_mov_b32_e32 v7, s14
+; GFX8-NEXT:    v_mov_b32_e32 v8, s7
 ; GFX8-NEXT:    v_mul_u32_u24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_u32_u24_e32 v5, s9, v6
+; GFX8-NEXT:    v_mul_u32_u24_e32 v5, s8, v6
 ; GFX8-NEXT:    v_mul_u32_u24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s7, s1, 0x40018
-; GFX8-NEXT:    v_mov_b32_e32 v9, s14
+; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x40018
+; GFX8-NEXT:    v_mov_b32_e32 v9, s13
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x40008
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_mov_b32_e32 v10, s13
-; GFX8-NEXT:    v_mov_b32_e32 v11, s6
-; GFX8-NEXT:    v_mov_b32_e32 v12, s12
-; GFX8-NEXT:    v_mov_b32_e32 v13, s11
-; GFX8-NEXT:    v_mov_b32_e32 v14, s4
+; GFX8-NEXT:    v_mov_b32_e32 v10, s12
+; GFX8-NEXT:    v_mov_b32_e32 v11, s5
+; GFX8-NEXT:    v_mov_b32_e32 v12, s11
+; GFX8-NEXT:    v_mov_b32_e32 v13, s10
+; GFX8-NEXT:    v_mov_b32_e32 v14, s3
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-NEXT:    v_mul_u32_u24_e32 v7, s7, v9
+; GFX8-NEXT:    v_mul_u32_u24_e32 v7, s6, v9
 ; GFX8-NEXT:    v_mul_u32_u24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_u32_u24_e32 v9, s5, v12
+; GFX8-NEXT:    v_mul_u32_u24_e32 v9, s4, v12
 ; GFX8-NEXT:    v_mul_u32_u24_sdwa v10, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v5, s0, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -2431,40 +2431,40 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s14, s1, 28
-; GFX9-NEXT:    s_and_b32 s15, s1, 15
-; GFX9-NEXT:    s_bfe_u32 s16, s1, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s17, s1, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s13, s1, 28
+; GFX9-NEXT:    s_and_b32 s14, s1, 15
+; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s16, s1, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v4, s12
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40018
-; GFX9-NEXT:    v_mov_b32_e32 v5, s13
-; GFX9-NEXT:    s_lshr_b32 s7, s0, 28
-; GFX9-NEXT:    v_mov_b32_e32 v6, s14
-; GFX9-NEXT:    s_and_b32 s8, s0, 15
-; GFX9-NEXT:    v_mov_b32_e32 v7, s15
-; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v8, s16
-; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v9, s17
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v4, s11
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40018
+; GFX9-NEXT:    v_mov_b32_e32 v5, s12
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 28
+; GFX9-NEXT:    v_mov_b32_e32 v6, s13
+; GFX9-NEXT:    s_and_b32 s7, s0, 15
+; GFX9-NEXT:    v_mov_b32_e32 v7, s14
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v8, s15
+; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v9, s16
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-NEXT:    v_mov_b32_e32 v10, s1
-; GFX9-NEXT:    v_mul_lo_u16_e32 v3, s4, v3
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_e32 v5, s6, v5
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, s7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_e32 v7, s8, v7
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v8, s9, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_e32 v3, s3, v3
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_e32 v5, s5, v5
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_e32 v7, s7, v7
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX9-NEXT:    v_mul_lo_u16_e32 v9, s10, v9
+; GFX9-NEXT:    v_mul_lo_u16_e32 v9, s9, v9
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v5, s2, v5
 ; GFX9-NEXT:    v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -2497,40 +2497,40 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s14, s1, 28
-; GFX9-DL-NEXT:    s_and_b32 s15, s1, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s16, s1, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s17, s1, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40018
+; GFX9-DL-NEXT:    s_lshr_b32 s13, s1, 28
+; GFX9-DL-NEXT:    s_and_b32 s14, s1, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s1, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s12
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40018
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s13
-; GFX9-DL-NEXT:    s_lshr_b32 s7, s0, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX9-DL-NEXT:    s_and_b32 s8, s0, 15
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s15
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s16
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s17
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s11
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40018
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s12
+; GFX9-DL-NEXT:    s_lshr_b32 s6, s0, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s13
+; GFX9-DL-NEXT:    s_and_b32 s7, s0, 15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s14
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s15
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s16
 ; GFX9-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s1
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, s4, v3
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, s6, v5
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, s7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, s8, v7
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v8, s9, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, s3, v3
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, s5, v5
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, s7, v7
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, s10, v9
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, s9, v9
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_and_b32_e32 v5, s2, v5
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -2553,58 +2553,58 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot8_acc8_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40004
-; GFX10-DL-NEXT:    s_and_b32 s5, s0, 15
-; GFX10-DL-NEXT:    s_and_b32 s7, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x4000c
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, s2, s4
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s4, s0, 15
+; GFX10-DL-NEXT:    s_and_b32 s6, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, s2, s3
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s5, s7
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40008
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s6, s8
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s4, s6
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40008
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s5, s7
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v3
-; GFX10-DL-NEXT:    s_mov_b32 s5, 0xffff
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40014
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s2, s4
+; GFX10-DL-NEXT:    s_mov_b32 s4, 0xffff
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40014
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s2, s3
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v5
 ; GFX10-DL-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x40014
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40018
 ; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s5, v3
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40010
-; GFX10-DL-NEXT:    s_lshr_b32 s9, s1, 28
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40010
+; GFX10-DL-NEXT:    s_lshr_b32 s8, s1, 28
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s4, s7
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s3, s6
 ; GFX10-DL-NEXT:    v_or_b32_e32 v4, v3, v4
 ; GFX10-DL-NEXT:    s_bfe_u32 s1, s1, 0x40018
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s2, s8
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, s0, s9
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s2, s7
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, s0, s8
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 8, v7
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v5
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s6, s1
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s5, s1
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v8
 ; GFX10-DL-NEXT:    v_or_b32_e32 v3, v6, v3
 ; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s5, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX10-DL-NEXT:    v_or_b32_e32 v4, v3, v5
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
@@ -2651,50 +2651,50 @@ entry:
 define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc4_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s14, s1, 28
-; GFX7-NEXT:    s_and_b32 s1, s1, 15
-; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x40004
-; GFX7-NEXT:    s_and_b32 s0, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s20
-; GFX7-NEXT:    v_mov_b32_e32 v3, s19
-; GFX7-NEXT:    v_mov_b32_e32 v4, s18
-; GFX7-NEXT:    v_mov_b32_e32 v5, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s16
-; GFX7-NEXT:    v_mov_b32_e32 v7, s15
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
+; GFX7-NEXT:    s_and_b32 s5, s5, 15
+; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
+; GFX7-NEXT:    s_and_b32 s4, s4, 15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s19
+; GFX7-NEXT:    v_mov_b32_e32 v3, s18
+; GFX7-NEXT:    v_mov_b32_e32 v4, s17
+; GFX7-NEXT:    v_mov_b32_e32 v5, s16
+; GFX7-NEXT:    v_mov_b32_e32 v6, s15
+; GFX7-NEXT:    v_mov_b32_e32 v7, s14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v2, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v3, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v4, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v5, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v7, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s14
-; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s13
+; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc4_vecMul:
@@ -2708,41 +2708,41 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s9, s0, 15
-; GFX8-NEXT:    s_and_b32 s16, s1, 15
-; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v4, s16
-; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX8-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX8-NEXT:    s_and_b32 s8, s0, 15
+; GFX8-NEXT:    s_and_b32 s15, s1, 15
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v4, s15
+; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40008
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40008
 ; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v6, s14
+; GFX8-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX8-NEXT:    v_mov_b32_e32 v7, s13
-; GFX8-NEXT:    v_mov_b32_e32 v8, s12
-; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    v_mov_b32_e32 v7, s12
+; GFX8-NEXT:    v_mov_b32_e32 v8, s11
+; GFX8-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
@@ -2759,41 +2759,41 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s9, s0, 15
-; GFX9-NEXT:    s_and_b32 s16, s1, 15
-; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v4, s16
-; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX9-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-NEXT:    s_and_b32 s8, s0, 15
+; GFX9-NEXT:    s_and_b32 s15, s1, 15
+; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40008
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
 ; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, s13
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
@@ -2810,41 +2810,41 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s9, s0, 15
-; GFX9-DL-NEXT:    s_and_b32 s16, s1, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX9-DL-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT:    s_and_b32 s8, s0, 15
+; GFX9-DL-NEXT:    s_and_b32 s15, s1, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40008
+; GFX9-DL-NEXT:    s_lshr_b32 s9, s1, 28
 ; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s14
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
 ; GFX9-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
 ; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s10
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
@@ -2852,44 +2852,44 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot8_acc4_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
-; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40008
+; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40008
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x4000c
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s7, v2
-; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s4, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x4000c
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s6, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s3, s4
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off

diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
index 2d85df7c2b94..f004bbe50093 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -16,40 +16,40 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
   ; GCN:   renamable $sgpr2 = COPY renamable $sgpr1
   ; GCN:   renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
   ; GCN:   renamable $sgpr1 = S_MOV_B32 61440
-  ; GCN:   renamable $sgpr4 = S_MOV_B32 -1
-  ; GCN:   undef renamable $sgpr8 = COPY killed renamable $sgpr0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
-  ; GCN:   renamable $sgpr9 = COPY killed renamable $sgpr2
-  ; GCN:   renamable $sgpr10 = COPY killed renamable $sgpr4
-  ; GCN:   renamable $sgpr11 = COPY killed renamable $sgpr1
+  ; GCN:   renamable $sgpr3 = S_MOV_B32 -1
+  ; GCN:   undef renamable $sgpr4 = COPY killed renamable $sgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+  ; GCN:   renamable $sgpr5 = COPY killed renamable $sgpr2
+  ; GCN:   renamable $sgpr6 = COPY killed renamable $sgpr3
+  ; GCN:   renamable $sgpr7 = COPY killed renamable $sgpr1
   ; GCN:   renamable $sgpr0 = S_MOV_B32 16
   ; GCN:   renamable $sgpr1 = S_MOV_B32 15
   ; GCN:   renamable $sgpr2 = S_MOV_B32 14
-  ; GCN:   renamable $sgpr4 = S_MOV_B32 13
-  ; GCN:   renamable $sgpr5 = S_MOV_B32 12
-  ; GCN:   renamable $sgpr6 = S_MOV_B32 11
-  ; GCN:   renamable $sgpr7 = S_MOV_B32 10
-  ; GCN:   renamable $sgpr12 = S_MOV_B32 9
-  ; GCN:   renamable $sgpr13 = S_MOV_B32 8
-  ; GCN:   renamable $sgpr14 = S_MOV_B32 7
-  ; GCN:   renamable $sgpr15 = S_MOV_B32 6
-  ; GCN:   renamable $sgpr16 = S_MOV_B32 5
-  ; GCN:   renamable $sgpr17 = S_MOV_B32 3
-  ; GCN:   renamable $sgpr18 = S_MOV_B32 2
-  ; GCN:   renamable $sgpr19 = S_MOV_B32 1
-  ; GCN:   renamable $sgpr20 = S_MOV_B32 0
-  ; GCN:   renamable $vgpr1 = COPY killed renamable $sgpr20
-  ; GCN:   renamable $vgpr2 = COPY killed renamable $sgpr19
-  ; GCN:   renamable $vgpr3 = COPY killed renamable $sgpr18
-  ; GCN:   renamable $vgpr4 = COPY killed renamable $sgpr17
-  ; GCN:   renamable $vgpr5 = COPY killed renamable $sgpr16
-  ; GCN:   renamable $vgpr6 = COPY killed renamable $sgpr15
-  ; GCN:   renamable $vgpr7 = COPY killed renamable $sgpr14
-  ; GCN:   renamable $vgpr8 = COPY killed renamable $sgpr13
-  ; GCN:   renamable $vgpr9 = COPY killed renamable $sgpr12
-  ; GCN:   renamable $vgpr10 = COPY killed renamable $sgpr7
-  ; GCN:   renamable $vgpr11 = COPY killed renamable $sgpr6
-  ; GCN:   renamable $vgpr12 = COPY killed renamable $sgpr5
-  ; GCN:   renamable $vgpr13 = COPY killed renamable $sgpr4
+  ; GCN:   renamable $sgpr3 = S_MOV_B32 13
+  ; GCN:   renamable $sgpr8 = S_MOV_B32 12
+  ; GCN:   renamable $sgpr9 = S_MOV_B32 11
+  ; GCN:   renamable $sgpr10 = S_MOV_B32 10
+  ; GCN:   renamable $sgpr11 = S_MOV_B32 9
+  ; GCN:   renamable $sgpr12 = S_MOV_B32 8
+  ; GCN:   renamable $sgpr13 = S_MOV_B32 7
+  ; GCN:   renamable $sgpr14 = S_MOV_B32 6
+  ; GCN:   renamable $sgpr15 = S_MOV_B32 5
+  ; GCN:   renamable $sgpr16 = S_MOV_B32 3
+  ; GCN:   renamable $sgpr17 = S_MOV_B32 2
+  ; GCN:   renamable $sgpr18 = S_MOV_B32 1
+  ; GCN:   renamable $sgpr19 = S_MOV_B32 0
+  ; GCN:   renamable $vgpr1 = COPY killed renamable $sgpr19
+  ; GCN:   renamable $vgpr2 = COPY killed renamable $sgpr18
+  ; GCN:   renamable $vgpr3 = COPY killed renamable $sgpr17
+  ; GCN:   renamable $vgpr4 = COPY killed renamable $sgpr16
+  ; GCN:   renamable $vgpr5 = COPY killed renamable $sgpr15
+  ; GCN:   renamable $vgpr6 = COPY killed renamable $sgpr14
+  ; GCN:   renamable $vgpr7 = COPY killed renamable $sgpr13
+  ; GCN:   renamable $vgpr8 = COPY killed renamable $sgpr12
+  ; GCN:   renamable $vgpr9 = COPY killed renamable $sgpr11
+  ; GCN:   renamable $vgpr10 = COPY killed renamable $sgpr10
+  ; GCN:   renamable $vgpr11 = COPY killed renamable $sgpr9
+  ; GCN:   renamable $vgpr12 = COPY killed renamable $sgpr8
+  ; GCN:   renamable $vgpr13 = COPY killed renamable $sgpr3
   ; GCN:   renamable $vgpr14 = COPY killed renamable $sgpr2
   ; GCN:   renamable $vgpr15 = COPY killed renamable $sgpr1
   ; GCN:   renamable $vgpr16 = COPY killed renamable $sgpr0
@@ -69,44 +69,44 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
   ; GCN:   renamable $vgpr30 = COPY killed renamable $vgpr14
   ; GCN:   renamable $vgpr31 = COPY killed renamable $vgpr15
   ; GCN:   renamable $vgpr32 = COPY killed renamable $vgpr16
-  ; GCN:   renamable $sgpr22_sgpr23 = S_MOV_B64 $exec
+  ; GCN:   renamable $sgpr20_sgpr21 = S_MOV_B64 $exec
   ; GCN:   renamable $vgpr1 = IMPLICIT_DEF
-  ; GCN:   renamable $sgpr24_sgpr25 = IMPLICIT_DEF
-  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
-  ; GCN:   SI_SPILL_S128_SAVE killed $sgpr8_sgpr9_sgpr10_sgpr11, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 16 into %stack.1, align 4, addrspace 5)
-  ; GCN:   SI_SPILL_V512_SAVE killed $vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32, %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 64 into %stack.2, align 4, addrspace 5)
-  ; GCN:   SI_SPILL_S64_SAVE killed $sgpr22_sgpr23, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.3, align 4, addrspace 5)
-  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
-  ; GCN:   SI_SPILL_S64_SAVE killed $sgpr24_sgpr25, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.5, align 4, addrspace 5)
+  ; GCN:   renamable $sgpr22_sgpr23 = IMPLICIT_DEF
+  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+  ; GCN:   SI_SPILL_S128_SAVE killed $sgpr4_sgpr5_sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5)
+  ; GCN:   SI_SPILL_V512_SAVE killed $vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32, %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 64 into %stack.2, align 4, addrspace 5)
+  ; GCN:   SI_SPILL_S64_SAVE killed $sgpr20_sgpr21, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.3, align 4, addrspace 5)
+  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
+  ; GCN:   SI_SPILL_S64_SAVE killed $sgpr22_sgpr23, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5)
   ; GCN: bb.1:
   ; GCN:   successors: %bb.1(0x40000000), %bb.3(0x40000000)
-  ; GCN:   $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 8 from %stack.5, align 4, addrspace 5)
-  ; GCN:   $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
-  ; GCN:   $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+  ; GCN:   $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 8 from %stack.5, align 4, addrspace 5)
+  ; GCN:   $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
+  ; GCN:   $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
   ; GCN:   renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
   ; GCN:   renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $sgpr2, killed $vgpr1, implicit $exec
   ; GCN:   renamable $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GCN:   S_SET_GPR_IDX_ON killed renamable $sgpr2, 1, implicit-def $m0, implicit undef $m0
-  ; GCN:   $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5)
+  ; GCN:   $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5)
   ; GCN:   renamable $vgpr18 = V_MOV_B32_e32 undef $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, implicit $m0
   ; GCN:   S_SET_GPR_IDX_OFF
   ; GCN:   renamable $vgpr19 = COPY renamable $vgpr18
   ; GCN:   renamable $sgpr6_sgpr7 = COPY renamable $sgpr4_sgpr5
-  ; GCN:   SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.5, align 4, addrspace 5)
-  ; GCN:   SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.6, align 4, addrspace 5)
-  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
-  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5)
-  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5)
+  ; GCN:   SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5)
+  ; GCN:   SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.6, align 4, addrspace 5)
+  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
+  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5)
+  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5)
   ; GCN:   $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GCN:   S_CBRANCH_EXECNZ %bb.1, implicit $exec
   ; GCN: bb.3:
   ; GCN:   successors: %bb.2(0x80000000)
-  ; GCN:   $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 8 from %stack.3, align 4, addrspace 5)
+  ; GCN:   $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 8 from %stack.3, align 4, addrspace 5)
   ; GCN:   $exec = S_MOV_B64 killed renamable $sgpr0_sgpr1
   ; GCN: bb.2:
-  ; GCN:   $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5)
-  ; GCN:   $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 16 from %stack.1, align 4, addrspace 5)
-  ; GCN:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
+  ; GCN:   $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5)
+  ; GCN:   $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 16 from %stack.1, align 4, addrspace 5)
+  ; GCN:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
   ; GCN:   S_ENDPGM 0
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1

diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 548124627997..ca033e5289f3 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -75,11 +75,12 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr() {
 ; GCN-NEXT:     runtime_loader_kernel_symbol = 0
 ; GCN-NEXT:    .end_amd_kernel_code_t
 ; GCN-NEXT:  ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s33, s17
-; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    s_add_u32 s12, s12, s33
+; GCN-NEXT:    s_add_u32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s17
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, gv.fptr0@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, gv.fptr0@rel32@hi+4
@@ -167,11 +168,12 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() {
 ; GCN-NEXT:     runtime_loader_kernel_symbol = 0
 ; GCN-NEXT:    .end_amd_kernel_code_t
 ; GCN-NEXT:  ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s33, s17
-; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    s_add_u32 s12, s12, s33
+; GCN-NEXT:    s_add_u32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s17
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, gv.fptr1@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, gv.fptr1@rel32@hi+4

diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index d29bca5aee73..070a36dd4a21 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1620,9 +1620,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
 ; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; SI-NEXT:    s_load_dwordx16 s[12:27], s[4:5], 0x10
 ; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
+; SI-NEXT:    s_add_u32 s0, s0, s7
+; SI-NEXT:    s_addc_u32 s1, s1, 0
 ; SI-NEXT:    v_mov_b32_e32 v16, 64
-; SI-NEXT:    s_mov_b32 s11, 0x100f000
-; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    s_and_b32 s4, s4, 7
@@ -1642,18 +1642,20 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
 ; SI-NEXT:    v_mov_b32_e32 v9, s21
 ; SI-NEXT:    v_mov_b32_e32 v10, s22
 ; SI-NEXT:    v_mov_b32_e32 v11, s23
-; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
-; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
-; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
+; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
+; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; SI-NEXT:    v_or_b32_e32 v16, s4, v16
 ; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0x40200000
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen
-; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64
-; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80
-; SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96
-; SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], v16, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:80
+; SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:96
+; SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:112
+; SI-NEXT:    s_mov_b32 s11, 0x100f000
+; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48
 ; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
@@ -1666,9 +1668,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
 ; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; VI-NEXT:    s_load_dwordx16 s[12:27], s[4:5], 0x40
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
+; VI-NEXT:    s_add_u32 s0, s0, s7
+; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v16, 64
-; VI-NEXT:    s_mov_b32 s11, 0x1100f000
-; VI-NEXT:    s_mov_b32 s10, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s12
 ; VI-NEXT:    s_and_b32 s4, s4, 7
@@ -1688,18 +1690,20 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v9, s21
 ; VI-NEXT:    v_mov_b32_e32 v10, s22
 ; VI-NEXT:    v_mov_b32_e32 v11, s23
-; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
-; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
-; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; VI-NEXT:    v_or_b32_e32 v16, s4, v16
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x40200000
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen
-; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64
-; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80
-; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96
-; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], v16, s[0:3], 0 offen
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:80
+; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:96
+; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:112
+; VI-NEXT:    s_mov_b32 s11, 0x1100f000
+; VI-NEXT:    s_mov_b32 s10, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32

diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 1b7f4df214c9..0a93986cd232 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -30,7 +30,7 @@ define hidden void @func() #1 {
 ; GCN-NOT: writelane
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
 
-; GCN: ; NumSgprs: 38
+; GCN: ; NumSgprs: 37
 ; GCN: ; NumVgprs: 9
 define amdgpu_kernel void @kernel_call() #0 {
   %vgpr = load volatile i32, i32 addrspace(1)* undef

diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 8cb822938fd7..40d622c25e18 100644
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -48,8 +48,8 @@
 ; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), [[FLAT_SCR_LO]]
 ; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), [[FLAT_SCR_HI]]
 
-; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
-; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
+; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen
+; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen
 
 ; Scratch size = alloca size + emergency stack slot, align {{.*}}, addrspace(5)
 ; ALL: ; ScratchSize: 32772

diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
index e69ddbe83e1a..4436b60be2a9 100644
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -3,15 +3,19 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s
 
 ; ALL-LABEL: {{^}}large_alloca_pixel_shader:
-; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN-DAG: s_mov_b32 s10, -1
-; CI-DAG: s_mov_b32 s11, 0xe8f000
-; VI-DAG: s_mov_b32 s11, 0xe80000
-; GFX9-DAG: s_mov_b32 s11, 0xe00000
+; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s6, -1
 
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+; CI-DAG: s_mov_b32 s7, 0xe8f000
+; VI-DAG: s_mov_b32 s7, 0xe80000
+; GFX9-DAG: s_mov_b32 s7, 0xe00000
+
+; GCN: s_add_u32 s4, s4, s0
+; GCN: s_addc_u32 s5, s5, 0
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen
 
 ; ALL: ; ScratchSize: 32772
 define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
@@ -25,15 +29,19 @@ define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
 }
 
 ; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg:
-; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN-DAG: s_mov_b32 s10, -1
-; CI-DAG: s_mov_b32 s11, 0xe8f000
-; VI-DAG: s_mov_b32 s11, 0xe80000
-; GFX9-DAG: s_mov_b32 s11, 0xe00000
-
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s6, -1
+
+; CI-DAG: s_mov_b32 s7, 0xe8f000
+; VI-DAG: s_mov_b32 s7, 0xe80000
+; GFX9-DAG: s_mov_b32 s7, 0xe00000
+
+; GCN: s_add_u32 s4, s4, s2
+; GCN: s_addc_u32 s5, s5, 0
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen
 
 ; ALL: ; ScratchSize: 32772
 define amdgpu_ps void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #0 {

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
index 3a69ef673b85..6b22d92e367d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
@@ -3,7 +3,7 @@
 ; FIXME: Requires stack object to not assert
 ; GCN-LABEL: {{^}}test_ps:
 ; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GCN: buffer_store_dword v0, off, s[4:7], s2 offset:4
+; GCN: buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: ; return
@@ -18,7 +18,7 @@ define amdgpu_ps i32 @test_ps() #1 {
 
 ; GCN-LABEL: {{^}}test_cs:
 ; GCN: s_mov_b64 s[4:5], s[0:1]
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:4
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:4
 ; GCN: s_load_dword s0, s[0:1], 0x0
 define amdgpu_cs i32 @test_cs() #1 {
   %alloca = alloca i32, addrspace(5)

diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 49f691b8ae30..37ca76e9489e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -531,13 +531,13 @@ entry:
 
 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
 ; GCN: s_waitcnt
-; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s33 offset:4094{{$}}
+; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}}
 ; GFX900: s_waitcnt
 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
 ; GFX900-NEXT: s_waitcnt
 ; GFX900-NEXT: s_setpc_b64
 
-; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
+; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
 define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
 entry:
   %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
@@ -549,13 +549,13 @@ entry:
 
 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
 ; GCN: s_waitcnt
-; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
+; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
 ; GFX900-NEXT: s_waitcnt
 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
 ; GFX900-NEXT: s_waitcnt
 ; GFX900-NEXT: s_setpc_b64
 
-; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
+; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
 define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
 entry:
   %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
@@ -649,13 +649,13 @@ entry:
 
 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
 ; GCN: s_waitcnt
-; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
+; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
 ; GFX900-NEXT: s_waitcnt
 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
 ; GFX900-NEXT: s_waitcnt
 ; GFX900-NEXT: s_setpc_b64
 
-; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
+; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
 define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
 entry:
   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
@@ -668,13 +668,13 @@ entry:
 
 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
 ; GCN: s_waitcnt
-; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
+; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
 ; GFX900-NEXT: s_waitcnt
 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
 ; GFX900-NEXT: s_waitcnt
 ; GFX900-NEXT: s_setpc_b64
 
-; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094{{$}}
+; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}}
 define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
 entry:
   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
@@ -687,13 +687,13 @@ entry:
 
 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
 ; GCN: s_waitcnt
-; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
+; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
 ; GFX900-NEXT: s_waitcnt
 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
 ; GFX900-NEXT: s_waitcnt
 ; GFX900-NEXT: s_setpc_b64
 
-; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
+; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
 define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
 entry:
   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)

diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index b7df7a58e82c..d83fda5d7861 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -1303,7 +1303,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
 ; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094
+; GFX900-NEXT:    buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
@@ -1312,7 +1312,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
 ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], s33 offset:4094
+; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
@@ -1323,7 +1323,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
 ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], s33 offset:4094
+; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1342,7 +1342,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
 ; GFX900-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094
+; GFX900-NEXT:    buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
@@ -1351,7 +1351,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
 ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], s33 offset:4094
+; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
@@ -1362,7 +1362,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
 ; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], s33 offset:4094
+; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1381,7 +1381,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
 ; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094
+; GFX900-NEXT:    buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
@@ -1390,7 +1390,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
 ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], s33 offset:4094
+; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1402,7 +1402,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
 ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], s33 offset:4094
+; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1504,7 +1504,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094
+; GFX900-NEXT:    buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
@@ -1513,7 +1513,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:4094
+; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
@@ -1525,7 +1525,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s33 offset:4094
+; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:4094
 ; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
@@ -1545,7 +1545,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
 ; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    buffer_load_sbyte_d16 v1, off, s[0:3], s33 offset:4094
+; GFX900-NEXT:    buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
@@ -1554,7 +1554,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
 ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    buffer_load_sbyte v0, off, s[0:3], s33 offset:4094
+; GFX906-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
@@ -1565,7 +1565,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
 ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX803-NEXT:    buffer_load_sbyte v0, off, s[0:3], s33 offset:4094
+; GFX803-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1585,7 +1585,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094
+; GFX900-NEXT:    buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
@@ -1594,7 +1594,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:4094
+; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1607,7 +1607,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s33 offset:4094
+; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:4094
 ; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4

diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
index 77b9fcb3915e..6aeb69335646 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
@@ -447,8 +447,8 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}nontemporal_private_0:
-; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
-; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
+; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
+; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
 ; GFX10:         .amdhsa_kernel nontemporal_private_0
 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
@@ -462,8 +462,8 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}nontemporal_private_1:
-; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
-; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
+; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
+; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
 ; GFX10:         .amdhsa_kernel nontemporal_private_1
 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0

diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
index 2294b7d200d1..dd81f00a40cb 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
@@ -314,8 +314,8 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}nontemporal_private_0:
-; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
-; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
+; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
+; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
 ; GFX10:         .amdhsa_kernel nontemporal_private_0
 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
@@ -329,8 +329,8 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}nontemporal_private_1:
-; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
-; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
+; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
+; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
 ; GFX10:         .amdhsa_kernel nontemporal_private_1
 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0

diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 51716847743c..15a8e9283e38 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -115,61 +115,50 @@ define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %ar
 ; GCN-NEXT:    v_and_b32_e32 v2, 0x3ff, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GCN-NEXT:    v_add_u32_e32 v0, v0, v2
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    buffer_load_dword v3, v0, s[0:3], s33 offen
-; GCN-NEXT:    buffer_load_dword v4, v0, s[0:3], s33 offen offset:4
-; GCN-NEXT:    buffer_load_dword v5, v0, s[0:3], s33 offen offset:8
-; GCN-NEXT:    buffer_load_dword v6, v0, s[0:3], s33 offen offset:12
-; GCN-NEXT:    buffer_load_dword v7, v0, s[0:3], s33 offen offset:16
-; GCN-NEXT:    buffer_load_dword v8, v0, s[0:3], s33 offen offset:20
-; GCN-NEXT:    buffer_load_dword v9, v0, s[0:3], s33 offen offset:24
-; GCN-NEXT:    buffer_load_dword v10, v0, s[0:3], s33 offen offset:28
-; GCN-NEXT:    buffer_load_dword v11, v0, s[0:3], s33 offen offset:32
-; GCN-NEXT:    buffer_load_dword v12, v0, s[0:3], s33 offen offset:36
-; GCN-NEXT:    buffer_load_dword v13, v0, s[0:3], s33 offen offset:40
-; GCN-NEXT:    buffer_load_dword v14, v0, s[0:3], s33 offen offset:44
-; GCN-NEXT:    buffer_load_dword v15, v0, s[0:3], s33 offen offset:48
-; GCN-NEXT:    buffer_load_dword v16, v0, s[0:3], s33 offen offset:52
-; GCN-NEXT:    buffer_load_dword v17, v0, s[0:3], s33 offen offset:56
 ; GCN-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], s33 offen offset:60
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    buffer_store_dword v3, v1, s[0:3], s33 offen
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], s33 offen offset:4
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], s33 offen offset:8
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], s33 offen offset:12
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], s33 offen offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], s33 offen offset:20
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], s33 offen offset:24
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], s33 offen offset:28
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], s33 offen offset:32
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v12, v1, s[0:3], s33 offen offset:36
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], s33 offen offset:40
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], s33 offen offset:44
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v15, v1, s[0:3], s33 offen offset:48
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], s33 offen offset:52
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v17, v1, s[0:3], s33 offen offset:56
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], s33 offen offset:60
+; GCN-NEXT:    buffer_load_dword v6, v0, s[0:3], 0 offen offset:20
+; GCN-NEXT:    buffer_load_dword v7, v0, s[0:3], 0 offen offset:24
+; GCN-NEXT:    buffer_load_dword v8, v0, s[0:3], 0 offen offset:28
+; GCN-NEXT:    buffer_load_dword v9, v0, s[0:3], 0 offen offset:32
+; GCN-NEXT:    buffer_load_dword v10, v0, s[0:3], 0 offen offset:36
+; GCN-NEXT:    buffer_load_dword v11, v0, s[0:3], 0 offen offset:40
+; GCN-NEXT:    buffer_load_dword v12, v0, s[0:3], 0 offen offset:44
+; GCN-NEXT:    buffer_load_dword v13, v0, s[0:3], 0 offen offset:48
+; GCN-NEXT:    buffer_load_dword v14, v0, s[0:3], 0 offen offset:52
+; GCN-NEXT:    buffer_load_dword v15, v0, s[0:3], 0 offen offset:56
+; GCN-NEXT:    buffer_load_dword v16, v0, s[0:3], 0 offen offset:60
+; GCN-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GCN-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
+; GCN-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen offset:12
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
+; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
+; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen offset:12
+; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen offset:20
+; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen offset:24
+; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen offset:28
+; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen offset:32
+; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen offset:36
+; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen offset:40
+; GCN-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen offset:44
+; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen offset:48
+; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen offset:52
+; GCN-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen offset:56
+; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen offset:60
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:

diff --git a/llvm/test/CodeGen/AMDGPU/mesa3d.ll b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
index 4f09b3f74804..41011afd5c84 100644
--- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
@@ -5,7 +5,7 @@
 ; GCN-DAG: s_mov_b32 s6, -1{{$}}
 ; GCN-DAG: s_mov_b32 s7, 0xe8f000
 ; GCN-DAG: v_mov_b32_e32 [[V:v[0-9]+]], 2
-; GCN: buffer_store_dword [[V]], off, s[4:7], s2 offset:4
+; GCN: buffer_store_dword [[V]], off, s[4:7], 0 offset:4
 define amdgpu_ps void @scratch_ps(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %alloca = alloca i32, addrspace(5)

diff --git a/llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir b/llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir
index cccf2c113ebe..71c7e32ff23b 100644
--- a/llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir
+++ b/llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir
@@ -15,7 +15,6 @@ frameInfo:
   maxAlignment:    4
 machineFunctionInfo:
   scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  scratchWaveOffsetReg: '$sgpr4'
   frameOffsetReg:  '$sgpr5'
   stackPtrOffsetReg: '$sgpr32'
 body:             |

diff --git a/llvm/test/CodeGen/AMDGPU/misched-killflags.mir b/llvm/test/CodeGen/AMDGPU/misched-killflags.mir
index 75d297e111a5..9d0c32214100 100644
--- a/llvm/test/CodeGen/AMDGPU/misched-killflags.mir
+++ b/llvm/test/CodeGen/AMDGPU/misched-killflags.mir
@@ -6,7 +6,6 @@ tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
   scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  scratchWaveOffsetReg: '$sgpr7'
   frameOffsetReg:  '$sgpr7'
 body: |
   bb.0:

diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
index 5a0d87f5186d..f984bb49c7b7 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
@@ -5,49 +5,49 @@
 ; Test addressing modes when the scratch base is not a frame index.
 
 ; GCN-LABEL: {{^}}store_private_offset_i8:
-; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:8
+; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @store_private_offset_i8() #0 {
   store volatile i8 5, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_i16:
-; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s2 offset:8
+; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @store_private_offset_i16() #0 {
   store volatile i16 5, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_i32:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @store_private_offset_i32() #0 {
   store volatile i32 5, i32 addrspace(5)* inttoptr (i32 8 to i32 addrspace(5)*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_v2i32:
-; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @store_private_offset_v2i32() #0 {
   store volatile <2 x i32> <i32 5, i32 10>, <2 x i32> addrspace(5)* inttoptr (i32 8 to <2 x i32> addrspace(5)*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_v4i32:
-; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8
+; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @store_private_offset_v4i32() #0 {
   store volatile <4 x i32> <i32 5, i32 10, i32 15, i32 0>, <4 x i32> addrspace(5)* inttoptr (i32 8 to <4 x i32> addrspace(5)*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_i8:
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s2 offset:8
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @load_private_offset_i8() #0 {
   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}sextload_private_offset_i8:
-; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @sextload_private_offset_i8(i32 addrspace(1)* %out) #0 {
   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*)
   %sextload = sext i8 %load to i32
@@ -56,7 +56,7 @@ define amdgpu_kernel void @sextload_private_offset_i8(i32 addrspace(1)* %out) #0
 }
 
 ; GCN-LABEL: {{^}}zextload_private_offset_i8:
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0 {
   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*)
   %zextload = zext i8 %load to i32
@@ -65,14 +65,14 @@ define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_i16:
-; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s2 offset:8
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @load_private_offset_i16() #0 {
   %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}sextload_private_offset_i16:
-; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @sextload_private_offset_i16(i32 addrspace(1)* %out) #0 {
   %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*)
   %sextload = sext i16 %load to i32
@@ -81,7 +81,7 @@ define amdgpu_kernel void @sextload_private_offset_i16(i32 addrspace(1)* %out) #
 }
 
 ; GCN-LABEL: {{^}}zextload_private_offset_i16:
-; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) #0 {
   %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*)
   %zextload = zext i16 %load to i32
@@ -90,28 +90,28 @@ define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) #
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_i32:
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @load_private_offset_i32() #0 {
   %load = load volatile i32, i32 addrspace(5)* inttoptr (i32 8 to i32 addrspace(5)*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_v2i32:
-; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8
+; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @load_private_offset_v2i32() #0 {
   %load = load volatile <2 x i32>, <2 x i32> addrspace(5)* inttoptr (i32 8 to <2 x i32> addrspace(5)*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_v4i32:
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8
+; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @load_private_offset_v4i32() #0 {
   %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* inttoptr (i32 8 to <4 x i32> addrspace(5)*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset:
-; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:4095
+; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:4095
 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 {
   store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4095 to i8 addrspace(5)*)
   ret void
@@ -119,7 +119,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 {
 
 ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1:
 ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000
-; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen{{$}}
+; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen{{$}}
 define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 {
   store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4096 to i8 addrspace(5)*)
   ret void
@@ -127,7 +127,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 {
 
 ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2:
 ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000
-; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen offset:1{{$}}
+; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen offset:1{{$}}
 define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 {
   store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4097 to i8 addrspace(5)*)
   ret void
@@ -139,10 +139,10 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 {
 ; GCN-LABEL: {{^}}store_private_unknown_bits_vaddr:
 ; SICIVI: v_add_{{i|u}}32_e32 [[ADDR0:v[0-9]+]], vcc, 4
 ; SICIVI: v_add_{{i|u}}32_e32 [[ADDR1:v[0-9]+]], vcc, 32, [[ADDR0]]
-; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
 
 ; GFX9: v_add_u32_e32 [[ADDR:v[0-9]+]], 4,
-; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:32
+; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32
 define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 {
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %vaddr = load volatile i32, i32 addrspace(1)* undef

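Every update in this file is the same substitution: with the scratch wave offset folded into the SRD base during the kernel prologue, scratch MUBUF accesses can pass the inline constant 0 as SOffset instead of occupying a live SGPR (s2 or s8 above). A minimal sketch of the resulting pattern, written as a hypothetical test in the same style (SRD placement in s[4:7] assumed, as in the checks above):

  ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:16
  define amdgpu_kernel void @soffset_zero_sketch() #0 {
    store volatile i32 7, i32 addrspace(5)* inttoptr (i32 16 to i32 addrspace(5)*)
    ret void
  }
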
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir
index 0ea085afc405..fee6b52d1a11 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir
@@ -9,7 +9,6 @@ tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
   scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
-  scratchWaveOffsetReg: '$sgpr101'
   frameOffsetReg:  '$sgpr101'
 body:             |
   ; GCN-LABEL: name: exec_src1_is_not_copy

diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
index a38bacd97a67..ef656f66fcc8 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
@@ -13,7 +13,6 @@
 ; GCN: def s[4:11]
 ; GCN: def s[12:19]
 ; GCN: def s[20:27]
-; GCN: def s[28:35]
 ; GCN: def s[36:43]
 ; GCN: def s[44:51]
 ; GCN: def s[52:59]
@@ -37,8 +36,8 @@
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 10
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 11
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 12
-; GCN-NEXT: v_writelane_b32 v0, s9, 13
-; GCN-NEXT: v_writelane_b32 v0, s10, 14
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 13
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 14
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 15
 
 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@@ -47,8 +46,8 @@
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 18
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 19
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 20
-; GCN-NEXT: v_writelane_b32 v0, s9, 21
-; GCN-NEXT: v_writelane_b32 v0, s10, 22
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 21
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 22
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 23
 
 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@@ -57,8 +56,8 @@
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 26
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 27
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 28
-; GCN-NEXT: v_writelane_b32 v0, s9, 29
-; GCN-NEXT: v_writelane_b32 v0, s10, 30
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 29
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 30
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 31
 
 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@@ -67,8 +66,8 @@
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 34
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 35
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 36
-; GCN-NEXT: v_writelane_b32 v0, s9, 37
-; GCN-NEXT: v_writelane_b32 v0, s10, 38
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 37
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 38
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 39
 
 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@@ -77,36 +76,38 @@
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 42
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 43
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 44
-; GCN-NEXT: v_writelane_b32 v0, s9, 45
-; GCN-NEXT: v_writelane_b32 v0, s10, 46
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 45
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 46
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47
 
 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
-; GCN: v_writelane_b32 v0, s12, 48
-; GCN-NEXT: v_writelane_b32 v0, s13, 49
-; GCN-NEXT: v_writelane_b32 v0, s14, 50
-; GCN-NEXT: v_writelane_b32 v0, s15, 51
-; GCN-NEXT: v_writelane_b32 v0, s16, 52
-; GCN-NEXT: v_writelane_b32 v0, s17, 53
-; GCN-NEXT: v_writelane_b32 v0, s18, 54
-; GCN-NEXT: v_writelane_b32 v0, s19, 55
-
-; GCN-NEXT: v_writelane_b32 v0, s20, 56
-; GCN-NEXT: v_writelane_b32 v0, s21, 57
-; GCN-NEXT: v_writelane_b32 v0, s22, 58
-; GCN-NEXT: v_writelane_b32 v0, s23, 59
-; GCN-NEXT: v_writelane_b32 v0, s24, 60
-; GCN-NEXT: v_writelane_b32 v0, s25, 61
-; GCN-NEXT: v_writelane_b32 v0, s26, 62
-; GCN-NEXT: v_writelane_b32 v0, s27, 63
-; GCN-NEXT: v_writelane_b32 v1, s28, 0
-; GCN-NEXT: v_writelane_b32 v1, s29, 1
-; GCN-NEXT: v_writelane_b32 v1, s30, 2
-; GCN-NEXT: v_writelane_b32 v1, s31, 3
-; GCN-NEXT: v_writelane_b32 v1, s32, 4
-; GCN-NEXT: v_writelane_b32 v1, s33, 5
-; GCN-NEXT: v_writelane_b32 v1, s34, 6
-; GCN-NEXT: v_writelane_b32 v1, s35, 7
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 48
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 49
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 53
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 54
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55
+
+; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
+; GCN: v_writelane_b32 v0, s12, 56
+; GCN-NEXT: v_writelane_b32 v0, s13, 57
+; GCN-NEXT: v_writelane_b32 v0, s14, 58
+; GCN-NEXT: v_writelane_b32 v0, s15, 59
+; GCN-NEXT: v_writelane_b32 v0, s16, 60
+; GCN-NEXT: v_writelane_b32 v0, s17, 61
+; GCN-NEXT: v_writelane_b32 v0, s18, 62
+; GCN-NEXT: v_writelane_b32 v0, s19, 63
+
+; GCN-NEXT: v_writelane_b32 v1, s20, 0
+; GCN-NEXT: v_writelane_b32 v1, s21, 1
+; GCN-NEXT: v_writelane_b32 v1, s22, 2
+; GCN-NEXT: v_writelane_b32 v1, s23, 3
+; GCN-NEXT: v_writelane_b32 v1, s24, 4
+; GCN-NEXT: v_writelane_b32 v1, s25, 5
+; GCN-NEXT: v_writelane_b32 v1, s26, 6
+; GCN-NEXT: v_writelane_b32 v1, s27, 7
 ; GCN-NEXT: v_writelane_b32 v1, s36, 8
 ; GCN-NEXT: v_writelane_b32 v1, s37, 9
 ; GCN-NEXT: v_writelane_b32 v1, s38, 10
@@ -184,16 +185,6 @@
 ; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 7
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
-; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v0, 48
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 49
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 50
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 51
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 52
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 53
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 54
-; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 55
-; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
-
 ; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v0, 56
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 57
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 58
@@ -393,7 +384,7 @@ ret:
 
 ; GCN-LABEL: {{^}}split_sgpr_spill_2_vgprs:
 ; GCN: def s[4:19]
-; GCN: def s[20:35]
+; GCN: def s[36:51]
 
 ; GCN: v_writelane_b32 v0, s4, 48
 ; GCN-NEXT: v_writelane_b32 v0, s5, 49
@@ -412,22 +403,22 @@ ret:
 ; GCN-NEXT: v_writelane_b32 v0, s18, 62
 ; GCN-NEXT: v_writelane_b32 v0, s19, 63
 
-; GCN: v_readlane_b32 s4, v0, 48
-; GCN-NEXT: v_readlane_b32 s5, v0, 49
-; GCN-NEXT: v_readlane_b32 s6, v0, 50
-; GCN-NEXT: v_readlane_b32 s7, v0, 51
-; GCN-NEXT: v_readlane_b32 s8, v0, 52
-; GCN-NEXT: v_readlane_b32 s9, v0, 53
-; GCN-NEXT: v_readlane_b32 s10, v0, 54
-; GCN-NEXT: v_readlane_b32 s11, v0, 55
-; GCN-NEXT: v_readlane_b32 s12, v0, 56
-; GCN-NEXT: v_readlane_b32 s13, v0, 57
-; GCN-NEXT: v_readlane_b32 s14, v0, 58
-; GCN-NEXT: v_readlane_b32 s15, v0, 59
-; GCN-NEXT: v_readlane_b32 s16, v0, 60
-; GCN-NEXT: v_readlane_b32 s17, v0, 61
-; GCN-NEXT: v_readlane_b32 s18, v0, 62
-; GCN-NEXT: v_readlane_b32 s19, v0, 63
+; GCN: v_readlane_b32 s0, v0, 48
+; GCN-NEXT: v_readlane_b32 s1, v0, 49
+; GCN-NEXT: v_readlane_b32 s2, v0, 50
+; GCN-NEXT: v_readlane_b32 s3, v0, 51
+; GCN-NEXT: v_readlane_b32 s4, v0, 52
+; GCN-NEXT: v_readlane_b32 s5, v0, 53
+; GCN-NEXT: v_readlane_b32 s6, v0, 54
+; GCN-NEXT: v_readlane_b32 s7, v0, 55
+; GCN-NEXT: v_readlane_b32 s8, v0, 56
+; GCN-NEXT: v_readlane_b32 s9, v0, 57
+; GCN-NEXT: v_readlane_b32 s10, v0, 58
+; GCN-NEXT: v_readlane_b32 s11, v0, 59
+; GCN-NEXT: v_readlane_b32 s12, v0, 60
+; GCN-NEXT: v_readlane_b32 s13, v0, 61
+; GCN-NEXT: v_readlane_b32 s14, v0, 62
+; GCN-NEXT: v_readlane_b32 s15, v0, 63
 define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 {
   %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
   %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -457,133 +448,133 @@ ret:
 
 ; GCN-LABEL: {{^}}no_vgprs_last_sgpr_spill:
 
-; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 0
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 1
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 2
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 3
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 4
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 5
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 6
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 7
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 8
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 9
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 10
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 11
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 12
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 13
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 14
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 15
-
-; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 16
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 17
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 18
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 19
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 20
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 21
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 22
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 23
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 24
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 25
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 26
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 27
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 28
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 29
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 30
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 31
+; GCN: v_writelane_b32 v31, s{{[0-9]+}}, 0
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 1
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 2
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 3
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 4
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 5
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 6
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 7
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 8
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 9
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 10
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 11
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 12
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 13
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 14
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 15
+
+; GCN: v_writelane_b32 v31, s{{[0-9]+}}, 16
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 17
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 18
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 19
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 20
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 21
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 22
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 23
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 24
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 25
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 26
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 27
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 28
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 29
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 30
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 31
 
 ; GCN: def s[0:1]
-; GCN:      v_writelane_b32 v23, s20, 32
-; GCN-NEXT: v_writelane_b32 v23, s21, 33
-
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 34
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 35
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 36
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 37
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 38
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 39
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 40
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 41
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 42
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 43
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 44
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 45
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 46
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 47
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 48
-; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 49
-
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN:      v_writelane_b32 v31, s{{[0-9]+}}, 32
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 33
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 34
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 35
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 36
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 37
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 38
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 39
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 40
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 41
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 42
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 43
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 44
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 45
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 46
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 47
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 48
+; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 49
+
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
 ; GCN: s_cbranch_scc1
 
 
-; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 0
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 1
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 2
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 3
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 4
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 5
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 6
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 7
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 8
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 9
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 10
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 11
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 12
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 13
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 14
-; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 15
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v31, 0
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 1
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 2
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 3
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 4
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 5
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 6
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 7
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 8
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 9
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 10
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 11
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 12
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 13
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 14
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v31, 15
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
 
-; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 32
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 33
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 34
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 35
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 36
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 37
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 38
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 39
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 40
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 41
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 42
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 43
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 44
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 45
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 46
-; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 47
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v31, 32
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 33
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 34
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 35
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 36
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 37
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 38
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 39
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 40
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 41
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 42
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 43
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 44
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 45
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 46
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v31, 47
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
-; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 16
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 17
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 18
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 19
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 20
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 21
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 22
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 23
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 24
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 25
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 26
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 27
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 28
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 29
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 30
-; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 31
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v31, 16
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 17
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 18
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 19
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 20
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 21
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 22
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 23
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 24
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 25
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 26
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 27
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 28
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 29
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 30
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v31, 31
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-
-; GCN: v_readfirstlane_b32 s1, v0
+; GCN: buffer_load_dword v[[RESTORE_TMP:[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0
+; GCN: v_readfirstlane_b32 s[[USE_TMP_LO:[0-9]+]], v[[RESTORE_TMP]]
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
+; GCN: v_readfirstlane_b32 s[[USE_TMP_HI:[0-9]+]], v[[RESTORE_TMP]]
 ; GCN: ;;#ASMSTART
-; GCN: ; use s[0:1]
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 {
   call void asm sideeffect "", "~{v[0:7]}" () #0
   call void asm sideeffect "", "~{v[8:15]}" () #0
-  call void asm sideeffect "", "~{v[16:19]}"() #0
-  call void asm sideeffect "", "~{v[20:21]}"() #0
-  call void asm sideeffect "", "~{v22}"() #0
+  call void asm sideeffect "", "~{v[16:23]}" () #0
+  call void asm sideeffect "", "~{v[24:27]}"() #0
+  call void asm sideeffect "", "~{v[28:29]}"() #0
+  call void asm sideeffect "", "~{v30}"() #0
 
   %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
   %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -606,4 +597,4 @@ ret:
 }
 
 attributes #0 = { nounwind }
-attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" }
+attributes #1 = { nounwind "amdgpu-waves-per-eu"="8,8" }

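The churn in this file falls out of SGPR-to-VGPR spill mechanics rather than the calling-convention change itself. Each 32-bit SGPR is spilled to one lane of a reserved VGPR and restored with the matching read, roughly (lane and register numbers illustrative):

  v_writelane_b32 v31, s12, 56   ; spill: s12 -> lane 56 of v31
  v_readlane_b32  s12, v31, 56   ; restore: lane 56 of v31 -> s12

A wave64 VGPR has 64 lanes, so spills roll over from lane 63 of v0 into lane 0 of v1, which is why the lane assignments renumber when the set of defs changes. Relaxing "amdgpu-waves-per-eu" from 10 to 8 raises the per-wave VGPR budget from 24 to 32, so the inline-asm clobbers widen and the reserved spill VGPR moves from v23 to v31.
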
diff --git a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
index 39915f2755ce..ba62ca822733 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
@@ -17,21 +17,23 @@ stack:
 machineFunctionInfo:
   isEntryFunction: true
   scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr33
-  frameOffsetReg:  $sgpr5
   stackPtrOffsetReg:  $sgpr32
+  argumentInfo:
+    privateSegmentWaveByteOffset: { reg: '$sgpr4' }
 
 body:             |
   ; CHECK-LABEL: name: scavenge_register_position
   ; CHECK: bb.0:
   ; CHECK:   successors: %bb.1(0x80000000)
-  ; CHECK:   liveins: $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK:   $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc
-  ; CHECK:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5)
+  ; CHECK:   liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
+  ; CHECK:   $sgpr0 = S_ADD_U32 $sgpr0, $sgpr4, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr5 = S_MOV_B32 524288
+  ; CHECK:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5)
   ; CHECK:   S_BRANCH %bb.1
   ; CHECK: bb.1:
   ; CHECK:   liveins: $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK:   $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc
+  ; CHECK:   $sgpr4 = S_MOV_B32 524288
   ; CHECK:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5)
   ; CHECK:   S_ENDPGM 0, implicit $vgpr0
   bb.0:

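The CHECK lines above show the core of the new scheme for entry functions: the unswizzled scratch wave byte offset (the $sgpr4 input named by privateSegmentWaveByteOffset) is added once into the 64-bit base address held in the first two dwords of the scratch rsrc descriptor, after which even a large frame offset such as 524288 is materialized with a plain move instead of an add against $sgpr32. In assembly terms the prologue reduces to a sketch like this (SGPR numbers depend on how many input SGPRs precede the offset):

  s_add_u32  s0, s0, s4    ; SRD base lo += scratch wave byte offset
  s_addc_u32 s1, s1, 0     ; propagate the carry into SRD base hi
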
diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
index 1c7adc39fe29..67d62e0d5003 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
@@ -19,8 +19,7 @@ stack:
 machineFunctionInfo:
   isEntryFunction: false
   scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr34
-  frameOffsetReg:  $sgpr33
+  frameOffsetReg:  $sgpr34
   stackPtrOffsetReg:  $sgpr32
 
 body:             |
@@ -29,21 +28,19 @@ body:             |
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs
     ; CHECK: liveins: $vgpr1
-    ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
+    ; CHECK: $sgpr27 = frame-setup COPY $sgpr34
     ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
-    ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
+    ; CHECK: $sgpr34 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
     ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
     ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
-    ; CHECK: $sgpr33 = S_LSHR_B32 killed $sgpr33, 6, implicit-def $scc
-    ; CHECK: $sgpr33 = S_ADD_U32 killed $sgpr33, 8192, implicit-def $scc
-    ; CHECK: $vgpr2 = COPY killed $sgpr33
-    ; CHECK: $sgpr33 = S_SUB_U32 killed $sgpr33, 8192, implicit-def $scc
-    ; CHECK: $sgpr33 = S_LSHL_B32 killed $sgpr33, 6, implicit-def $scc
-    ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc
+    ; CHECK: $sgpr34 = S_LSHR_B32 $sgpr34, 6, implicit-def $scc
+    ; CHECK: $sgpr34 = S_ADD_U32 killed $sgpr34, 8192, implicit-def $scc
+    ; CHECK: $vgpr2 = COPY killed $sgpr34
+    ; CHECK: $sgpr34 = S_SUB_U32 killed $sgpr34, 8192, implicit-def $scc
+    ; CHECK: $sgpr34 = S_LSHL_B32 $sgpr34, 6, implicit-def $scc
     ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
     ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
-    ; CHECK: $sgpr33 = frame-setup COPY $sgpr27
+    ; CHECK: $sgpr34 = frame-setup COPY $sgpr27
     ; CHECK: S_ENDPGM 0, implicit $vcc
     S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
     $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
@@ -64,8 +61,7 @@ stack:
 machineFunctionInfo:
   isEntryFunction: false
   scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr34
-  frameOffsetReg:  $sgpr33
+  frameOffsetReg:  $sgpr34
   stackPtrOffsetReg:  $sgpr32
 
 body:             |
@@ -74,18 +70,17 @@ body:             |
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr
     ; CHECK: liveins: $vgpr1
-    ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
+    ; CHECK: $sgpr27 = frame-setup COPY $sgpr34
     ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
-    ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
+    ; CHECK: $sgpr34 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
     ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
     ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK: $sgpr29 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
-    ; CHECK: $sgpr29 = S_LSHR_B32 killed $sgpr29, 6, implicit-def $scc
+    ; CHECK: $sgpr29 = S_LSHR_B32 $sgpr34, 6, implicit-def $scc
     ; CHECK: $sgpr29 = S_ADD_U32 killed $sgpr29, 8192, implicit-def $scc
     ; CHECK: $vgpr2 = COPY killed $sgpr29
     ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31
     ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
-    ; CHECK: $sgpr33 = frame-setup COPY $sgpr27
+    ; CHECK: $sgpr34 = frame-setup COPY $sgpr27
     ; CHECK: S_ENDPGM 0, implicit $vcc
     S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
     $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31
@@ -106,8 +101,7 @@ stack:
 machineFunctionInfo:
   isEntryFunction: false
   scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr34
-  frameOffsetReg:  $sgpr33
+  frameOffsetReg:  $sgpr34
   stackPtrOffsetReg:  $sgpr32
 
 body:             |
@@ -116,18 +110,17 @@ body:             |
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr_64
     ; CHECK: liveins: $vgpr1
-    ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
+    ; CHECK: $sgpr27 = frame-setup COPY $sgpr34
     ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
-    ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
+    ; CHECK: $sgpr34 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
     ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
     ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK: $sgpr28 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
-    ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $sgpr28, implicit $exec
+    ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr34, implicit $exec
     ; CHECK: $sgpr28 = S_MOV_B32 8192
     ; CHECK: $vgpr2, dead $sgpr28_sgpr29 = V_ADD_I32_e64 killed $sgpr28, killed $vgpr3, 0, implicit $exec
     ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31
     ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
-    ; CHECK: $sgpr33 = frame-setup COPY $sgpr27
+    ; CHECK: $sgpr34 = frame-setup COPY $sgpr27
     ; CHECK: S_ENDPGM 0, implicit $vcc
     S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
     $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31
@@ -147,8 +140,7 @@ stack:
 machineFunctionInfo:
   isEntryFunction: false
   scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr34
-  frameOffsetReg:  $sgpr33
+  frameOffsetReg:  $sgpr34
   stackPtrOffsetReg:  $sgpr32
 
 body:             |
@@ -157,18 +149,17 @@ body:             |
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei_prefer_vcc
     ; CHECK: liveins: $vgpr1
-    ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
+    ; CHECK: $sgpr27 = frame-setup COPY $sgpr34
     ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
-    ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
+    ; CHECK: $sgpr34 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
     ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
     ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31
-    ; CHECK: $vcc_hi = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
-    ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $vcc_hi, implicit $exec
+    ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr34, implicit $exec
     ; CHECK: $vcc_lo = S_MOV_B32 8192
     ; CHECK: $vgpr2, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec
     ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31
     ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
-    ; CHECK: $sgpr33 = frame-setup COPY $sgpr27
+    ; CHECK: $sgpr34 = frame-setup COPY $sgpr27
     ; CHECK: S_ENDPGM 0
     S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31
     $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31

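These hunks show the other half of the scheme, for non-entry functions: the frame pointer is already wave-relative, so converting it to a swizzled per-lane offset no longer has to subtract the scratch wave offset first, and a single S_LSHR_B32 by 6 divides the unswizzled byte offset by the wave64 wavefront size of 64. As a worked example, the 1572864 unswizzled bytes the prologue adds to $sgpr32 correspond to 1572864 / 64 = 24576 bytes of per-lane scratch, and the object addressed above sits at swizzled offset 8192 within that frame, hence the shift followed by the S_ADD_U32 of 8192.
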
diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
index 90afb185ccb7..67e271bc2122 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
@@ -14,7 +14,6 @@ stack:
 machineFunctionInfo:
   isEntryFunction: false
   scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr34
   frameOffsetReg:  $sgpr33
   stackPtrOffsetReg:  $sgpr32
 
@@ -29,10 +28,8 @@ body:             |
     ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
     ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
     ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
-    ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $sgpr33, implicit $exec
+    ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
     ; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec
-    ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc
     ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
     ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
     ; CHECK: $sgpr33 = frame-setup COPY $sgpr27

diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
index 853e2346031e..2fbc51f036f0 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
@@ -14,7 +14,6 @@ stack:
 machineFunctionInfo:
   isEntryFunction: false
   scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr34
   frameOffsetReg:  $sgpr33
   stackPtrOffsetReg:  $sgpr32
 
@@ -29,9 +28,7 @@ body:             |
     ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294705152, implicit-def $scc
     ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 524288, implicit-def $scc
     ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
-    ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
     ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
-    ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc
     ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
     ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 524288, implicit-def $scc
     ; CHECK: $sgpr33 = frame-setup COPY $sgpr27

diff --git a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll
index 8e96fbb178ca..1c48689f07a4 100644
--- a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll
@@ -10,14 +10,13 @@
 ; GCN-LABEL: {{^}}store_to_undef:
 ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
-; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}}
-; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen{{$}}
 
 ; -O0 should assume spilling, so the input scratch resource descriptor
 ; should be used directly without any copies.
 
 ; OPTNONE-NOT: s_mov_b32
-; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s5 offen{{$}}
+; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
 define amdgpu_kernel void @store_to_undef() #0 {
   store volatile i32 0, i32 addrspace(5)* undef
   ret void
@@ -26,8 +25,7 @@ define amdgpu_kernel void @store_to_undef() #0 {
 ; GCN-LABEL: {{^}}store_to_inttoptr:
 ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
-; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}}
-; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124{{$}}
 define amdgpu_kernel void @store_to_inttoptr() #0 {
  store volatile i32 0, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*)
  ret void
@@ -36,8 +34,7 @@ define amdgpu_kernel void @store_to_inttoptr() #0 {
 ; GCN-LABEL: {{^}}load_from_undef:
 ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
-; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}}
-; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen{{$}}
 define amdgpu_kernel void @load_from_undef() #0 {
   %ld = load volatile i32, i32 addrspace(5)* undef
   ret void
@@ -46,8 +43,7 @@ define amdgpu_kernel void @load_from_undef() #0 {
 ; GCN-LABEL: {{^}}load_from_inttoptr:
 ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
-; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}}
-; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124{{$}}
 define amdgpu_kernel void @load_from_inttoptr() #0 {
   %ld = load volatile i32, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*)
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/private-element-size.ll b/llvm/test/CodeGen/AMDGPU/private-element-size.ll
index 920eaa20a97b..d5e5ba5202fc 100644
--- a/llvm/test/CodeGen/AMDGPU/private-element-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-element-size.ll
@@ -10,32 +10,32 @@
 ; HSA-ELT4: private_element_size = 1
 
 
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
-; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24{{$}}
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40
-
-; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
-; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
-
-
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}}
-
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
+
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:24{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:40
+
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:32{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:36{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:40{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:44{{$}}
+
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:12{{$}}
 define amdgpu_kernel void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -59,53 +59,53 @@ entry:
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80
-
-; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-
-
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:56
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:88
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:72
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64
-
-; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
-; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
-
-
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:48{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:52{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:56{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:60{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:64{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:68{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:72{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:76{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:80{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:84{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:88{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:92{{$}}
-
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:48
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:64
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:80
+
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
+
+
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:40
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:48
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:56
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:88
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:80
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:72
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:64
+
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:32{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:36{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:40{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:44{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:48{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:52{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:56{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:60{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:64{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:68{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:72{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:76{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:80{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:84{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:88{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:92{{$}}
+
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:28{{$}}
 define amdgpu_kernel void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -130,19 +130,19 @@ entry:
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:1
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:2
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], 0 offset:1
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], 0 offset:2
 
-; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}}
 
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
 define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -166,19 +166,19 @@ entry:
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:24
 
-; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}}
 
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
 define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -202,32 +202,32 @@ entry:
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
-; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
 
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16{{$}}
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:24
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:40
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32
 
-; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
-; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:32{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:36{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:40{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:44{{$}}
 
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:12{{$}}
 define amdgpu_kernel void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

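The pattern in the updated checks above is uniform: in entry functions the
MUBUF SOffset operand is now the inline constant 0, because the scratch wave
offset has already been folded into the buffer descriptor. As a hedged,
illustrative sketch (the function name and RUN line are mine, not part of the
patch; exact SGPR numbers and immediate offsets depend on the subtarget and
stack layout), an input of this shape exercises the new form:

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck %s
; The store addresses a wave-relative stack object, so no SOffset register
; is needed; the inline constant 0 is used instead.
; CHECK-LABEL: {{^}}store_private_sketch:
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:{{[0-9]+}}
define amdgpu_kernel void @store_private_sketch(i32 %v) {
entry:
  %alloca = alloca i32, align 4, addrspace(5)
  store volatile i32 %v, i32 addrspace(5)* %alloca
  ret void
}
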
diff --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
index 331cccd853c2..d7892a0c9759 100644
--- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
@@ -13,7 +13,6 @@ selected:        false
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  scratchWaveOffsetReg: '$sgpr4'
   frameOffsetReg:  '$sgpr4'
 
 registers:
@@ -99,7 +98,6 @@ selected:        false
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  scratchWaveOffsetReg: '$sgpr4'
   frameOffsetReg:  '$sgpr4'
 registers:
   - { id: 0, class: vgpr_32, preferred-register: '' }

diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
index 0d2f90793fc3..e12cff942bc6 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
@@ -10,7 +10,6 @@ tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
   scratchRSrcReg:  '$sgpr24_sgpr25_sgpr26_sgpr27'
-  scratchWaveOffsetReg: '$sgpr32'
   frameOffsetReg:  '$sgpr32'
   stackPtrOffsetReg: '$sgpr32'
   argumentInfo:

diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
index e9f0678f7782..bd0423c5457c 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
@@ -11,7 +11,6 @@ frameInfo:
 machineFunctionInfo:
   isEntryFunction: true
   scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
-  scratchWaveOffsetReg: '$sgpr101'
   frameOffsetReg:  '$sgpr101'
   stackPtrOffsetReg: '$sgpr101'
   argumentInfo:

diff --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
index 3631d673fa25..d1903f457b30 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -9,9 +9,9 @@
 ; should be able to reuse the same register for each scratch buffer access.
 
 ; GCN-LABEL: {{^}}legal_offset_fi:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:4{{$}}
 ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8004
-; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}}
 
 define amdgpu_kernel void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
 entry:
@@ -47,11 +47,11 @@ done:
 }
 
 ; GCN-LABEL: {{^}}legal_offset_fi_offset:
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}}
 ; This constant isn't folded, because it has multiple uses.
 ; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8004
 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]]
-; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}}
 
 define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {
 entry:
@@ -88,7 +88,7 @@ done:
 
 ; GCN-LABEL: {{^}}neg_vaddr_offset_inbounds:
 ; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], 0 offen{{$}}
 define amdgpu_kernel void @neg_vaddr_offset_inbounds(i32 %offset) {
 entry:
   %array = alloca [8192 x i32], addrspace(5)
@@ -100,7 +100,7 @@ entry:
 
 ; GCN-LABEL: {{^}}neg_vaddr_offset:
 ; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], 0 offen{{$}}
 define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) {
 entry:
   %array = alloca [8192 x i32], addrspace(5)
@@ -111,7 +111,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}pos_vaddr_offset:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:20
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:20
 define amdgpu_kernel void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
 entry:
   %array = alloca [8192 x i32], addrspace(5)

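The two check variants above reflect the 12-bit unsigned MUBUF immediate
offset field (0..4095): a frame offset that fits is encoded directly with
SOffset 0, while a larger constant such as 0x8004 must be materialized into a
VGPR and used with "offen". A hedged sketch of an input producing both shapes
(the function name is illustrative; the 0x8004 constant assumes the usual
layout where the first object lands at offset 4, so 4 + 8192*4 = 0x8004):

; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}offset_range_sketch:
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
; CHECK: v_mov_b32_e32 [[OFF:v[0-9]+]], 0x8004
; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFF]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen
define amdgpu_kernel void @offset_range_sketch() {
entry:
  %big = alloca [8192 x i32], align 4, addrspace(5)
  %next = alloca i32, align 4, addrspace(5)
  %first = getelementptr inbounds [8192 x i32], [8192 x i32] addrspace(5)* %big, i32 0, i32 0
  store volatile i32 1, i32 addrspace(5)* %first
  store volatile i32 2, i32 addrspace(5)* %next
  ret void
}
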
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index a7718496ee9a..af933dc94d6e 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -29,8 +29,8 @@
 ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]]
 ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]]
 
-; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 define amdgpu_ps float @ps_main(i32 %idx) {
   %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
   %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
@@ -41,8 +41,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GCN-LABEL: {{^}}vs_main:
 ; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
 ; GCN-NOT: s_mov_b32 s0
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 define amdgpu_vs float @vs_main(i32 %idx) {
   %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
   %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
@@ -52,8 +52,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 
 ; GCN-LABEL: {{^}}cs_main:
 ; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 define amdgpu_cs float @cs_main(i32 %idx) {
   %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
   %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
@@ -64,13 +64,13 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GCN-LABEL: {{^}}hs_main:
 ; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
 ; SIVI-NOT: s_mov_b32 s0
-; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen
-; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen
+; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 
 ; GFX9_10: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
 ; GFX9_10-NOT: s_mov_b32 s5
-; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
-; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
+; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 define amdgpu_hs float @hs_main(i32 %idx) {
   %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
   %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
@@ -80,12 +80,12 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 
 ; GCN-LABEL: {{^}}gs_main:
 ; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen
-; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen
+; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 
 ; GFX9_10: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
-; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
-; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
+; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 define amdgpu_gs float @gs_main(i32 %idx) {
   %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
   %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
@@ -93,16 +93,21 @@ define amdgpu_gs float @gs_main(i32 %idx) {
   ret float %r
 }
 
+; Mesa GS and HS shaders have the preloaded scratch wave offset SGPR fixed at
+; SGPR5, and an inreg argument is used to reference it in the IR. The
+; following tests confirm that the shader, and anything inserted after the
+; return (i.e. SI_RETURN_TO_EPILOG), can still access the scratch wave offset.
+
 ; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset:
 ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 
 ; SIVI-NOT: s_mov_b32 s6
-; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen
-; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen
+; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 
 ; GFX9_10-NOT: s_mov_b32 s5
-; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
-; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
+; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 
 ; GCN-DAG: s_mov_b32 s2, s5
 define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {
@@ -117,11 +122,11 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
 ; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset:
 ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 
-; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen
-; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen
+; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 
-; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
-; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen
+; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
+; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 
 ; GCN-DAG: s_mov_b32 s2, s5
 define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {

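For reference, the shape these tests reduce to: the scratch wave offset
arrives as the sixth inreg argument and remains usable as an ordinary SGPR
value in the IR, but scratch accesses themselves no longer consume it as
SOffset. A hedged, simplified sketch (the names, RUN line, and reduced
signature are mine; the real tests above use larger argument lists and force
scratch traffic with large vectors instead of a dynamically indexed alloca):

; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
; The dynamically indexed load must go through scratch, with SOffset 0,
; while %swo is still returned as a plain SGPR value.
; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offen
define amdgpu_hs <{i32, float}> @hs_swo_sketch(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {
  %arr = alloca [64 x float], align 4, addrspace(5)
  %gep = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %arr, i32 0, i32 %idx
  %v = load volatile float, float addrspace(5)* %gep
  %r0 = insertvalue <{i32, float}> undef, i32 %swo, 0
  %r1 = insertvalue <{i32, float}> %r0, float %v, 1
  ret <{i32, float}> %r1
}
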
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
index afb0a8aa7e40..5021923f4675 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
@@ -35,13 +35,13 @@
 
 # SHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5)
 # SHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
-# SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5)
-# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5)
-# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0
+# SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5)
+# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5)
+# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0
 # SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5)
 # SHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
-# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5)
-# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0
+# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5)
+# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
 # SHARE:  $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5)
 
 # NOSHARE: stack:
@@ -60,14 +60,14 @@
 
 # NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5)
 # NOSHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
-# NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5)
-# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5)
-# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0
+# NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5)
+# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5)
+# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0
 # NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5)
 # NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.3, addrspace 5)
 # NOSHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
-# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5)
-# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0
+# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5)
+# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
 # NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.3, addrspace 5)
 
 ...
@@ -78,7 +78,6 @@ frameInfo:
   hasCalls:        true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   frameOffsetReg: $sgpr32
   stackPtrOffsetReg: $sgpr32
 body:             |
@@ -88,13 +87,13 @@ body:             |
     %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
     ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0
     $sgpr32 = COPY %0
     %4:sreg_32_xm0 = COPY $sgpr32
     ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
     $vgpr0 = COPY %2
-    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit killed $vgpr0
+    dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0
     $sgpr32 = COPY %4
     ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32
 

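The SI_CALL updates above drop the implicit $sgpr4 operand because callees no
longer receive a scratch wave offset register; calls now carry only the rsrc
and the ABI stack pointer, and the freed SGPRs shift the temporary pair from
$sgpr6_sgpr7 down to $sgpr4_sgpr5. At the IR level, an entry function that
makes a call initializes the (unswizzled, now wave-relative) stack pointer to
zero when it has no frame of its own. A hedged sketch with illustrative names
(the new cc-update.ll test added by this commit covers this in detail):

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck %s
declare void @extern_callee(i32)

; The kernel has no stack objects of its own, so s32 starts at 0 rather
; than at a queue-relative scratch wave offset.
; CHECK-LABEL: {{^}}kernel_call_sketch:
; CHECK: s_mov_b32 s32, 0
; CHECK: s_swappc_b64
define amdgpu_kernel void @kernel_call_sketch() {
  call void @extern_callee(i32 0)
  ret void
}
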
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
index ed57ec6cca35..8b9942c587cc 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -345,10 +345,10 @@ define void @shl_add_ptr_combine_2use_both_max_lds_offset(i32 %idx) #0 {
 
 ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_private:
 ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 2, v0
-; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen offset:16
+; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:16
 
 ; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 3, v0
-; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s33 offen offset:32
+; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen offset:32
 define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 {
   %idx = zext i16 %idx.arg to i32
   %idx.add = add nuw i32 %idx, 4
@@ -364,9 +364,9 @@ define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 {
 ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_max_private_offset:
 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen offset:4088
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:4088
 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x1ff0, [[SCALE1]]
-; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[0:3], s33 offen{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[0:3], 0 offen{{$}}
 define void @shl_add_ptr_combine_2use_max_private_offset(i16 zeroext %idx.arg) #0 {
   %idx = zext i16 %idx.arg to i32
   %idx.add = add nuw i32 %idx, 511
@@ -382,8 +382,8 @@ define void @shl_add_ptr_combine_2use_max_private_offset(i16 zeroext %idx.arg) #
 ; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x100, v0
 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 4, [[ADD]]
 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 5, [[ADD]]
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s33 offen{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen{{$}}
 define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.arg) #0 {
   %idx = zext i16 %idx.arg to i32
   %idx.add = add nuw i32 %idx, 256

diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
index b93658665be0..72ba9152a7e3 100644
--- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -3,11 +3,10 @@
 ; Make sure this doesn't crash.
 ; ALL-LABEL: {{^}}test:
 ; ALL: s_mov_b32 s[[LO:[0-9]+]], SCRATCH_RSRC_DWORD0
-; ALL: s_mov_b32 s[[OFF:[0-9]+]], s3
 ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000
 
 ; Make sure we are handling hazards correctly.
-; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:16
+; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
 ; SGPR-NEXT: s_waitcnt vmcnt(0)
 ; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
 ; SGPR-NEXT: s_nop 4

diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index f887a959cbd2..508f08ed924b 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -117,7 +117,7 @@ entry:
 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
 ; GCN-NOT: v0
 ; GCN-NOT: s32
-; GCN: buffer_load_dword v1, off, s[0:3], s33 offset:16
+; GCN: buffer_load_dword v1, off, s[0:3], 0 offset:16
 ; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
 ; GCN-NEXT: s_setpc_b64
 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {

diff --git a/llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll
deleted file mode 100644
index e1f6eb715a31..000000000000
--- a/llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll
+++ /dev/null
@@ -1,102 +0,0 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=MESA3D,ALL %s
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=UNKNOWN,ALL %s
-
-; Make sure shaders pick a workable SP with > 32 input SGPRs.
-; FIXME: Doesn't seem to be getting initial value from right register?
-
-; ALL-LABEL: {{^}}too_many_input_sgprs_32:
-; MESA3D-NOT: s34
-; MESA3D: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s34 offset:4
-
-; Happens to end up in s32 anyway
-; UNKNOWN-NOT: s32
-; UNKNOWN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4
-define amdgpu_ps i32 @too_many_input_sgprs_32(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7,
-                                              i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15,
-                                              i32 inreg %arg16, i32 inreg %arg17, i32 inreg %arg18, i32 inreg %arg19, i32 inreg %arg20, i32 inreg %arg21, i32 inreg %arg22, i32 inreg %arg23,
-                                              i32 inreg %arg24, i32 inreg %arg25, i32 inreg %arg26, i32 inreg %arg27, i32 inreg %arg28, i32 inreg %arg29, i32 inreg %arg30, i32 inreg %arg31) {
-bb:
-  %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
-  %tmp = add i32 %arg, %arg1
-  %tmp32 = add i32 %tmp, %arg2
-  %tmp33 = add i32 %tmp32, %arg3
-  %tmp34 = add i32 %tmp33, %arg4
-  %tmp35 = add i32 %tmp34, %arg5
-  %tmp36 = add i32 %tmp35, %arg6
-  %tmp37 = add i32 %tmp36, %arg7
-  %tmp38 = add i32 %tmp37, %arg8
-  %tmp39 = add i32 %tmp38, %arg9
-  %tmp40 = add i32 %tmp39, %arg10
-  %tmp41 = add i32 %tmp40, %arg11
-  %tmp42 = add i32 %tmp41, %arg12
-  %tmp43 = add i32 %tmp42, %arg13
-  %tmp44 = add i32 %tmp43, %arg14
-  %tmp45 = add i32 %tmp44, %arg15
-  %tmp46 = add i32 %tmp45, %arg16
-  %tmp47 = add i32 %tmp46, %arg17
-  %tmp48 = add i32 %tmp47, %arg18
-  %tmp49 = add i32 %tmp48, %arg19
-  %tmp50 = add i32 %tmp49, %arg20
-  %tmp51 = add i32 %tmp50, %arg21
-  %tmp52 = add i32 %tmp51, %arg22
-  %tmp53 = add i32 %tmp52, %arg23
-  %tmp54 = add i32 %tmp53, %arg24
-  %tmp55 = add i32 %tmp54, %arg25
-  %tmp56 = add i32 %tmp55, %arg26
-  %tmp57 = add i32 %tmp56, %arg27
-  %tmp58 = add i32 %tmp57, %arg28
-  %tmp59 = add i32 %tmp58, %arg29
-  %tmp60 = add i32 %tmp59, %arg30
-  %tmp61 = add i32 %tmp60, %arg31
-  ret i32 %tmp61
-}
-
-; ALL-LABEL: {{^}}too_many_input_sgprs_33:
-; MESA3D-NOT: s35
-; MESA3D: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s35 offset:4
-
-; UNKNOWN-NOT: s33
-; UNKNOWN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s33 offset:4
-define amdgpu_ps i32 @too_many_input_sgprs_33(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7,
-                                              i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15,
-                                              i32 inreg %arg16, i32 inreg %arg17, i32 inreg %arg18, i32 inreg %arg19, i32 inreg %arg20, i32 inreg %arg21, i32 inreg %arg22, i32 inreg %arg23,
-                                              i32 inreg %arg24, i32 inreg %arg25, i32 inreg %arg26, i32 inreg %arg27, i32 inreg %arg28, i32 inreg %arg29, i32 inreg %arg30, i32 inreg %arg31,
-                                              i32 inreg %arg32) {
-bb:
-  %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
-  %tmp = add i32 %arg, %arg1
-  %tmp32 = add i32 %tmp, %arg2
-  %tmp33 = add i32 %tmp32, %arg3
-  %tmp34 = add i32 %tmp33, %arg4
-  %tmp35 = add i32 %tmp34, %arg5
-  %tmp36 = add i32 %tmp35, %arg6
-  %tmp37 = add i32 %tmp36, %arg7
-  %tmp38 = add i32 %tmp37, %arg8
-  %tmp39 = add i32 %tmp38, %arg9
-  %tmp40 = add i32 %tmp39, %arg10
-  %tmp41 = add i32 %tmp40, %arg11
-  %tmp42 = add i32 %tmp41, %arg12
-  %tmp43 = add i32 %tmp42, %arg13
-  %tmp44 = add i32 %tmp43, %arg14
-  %tmp45 = add i32 %tmp44, %arg15
-  %tmp46 = add i32 %tmp45, %arg16
-  %tmp47 = add i32 %tmp46, %arg17
-  %tmp48 = add i32 %tmp47, %arg18
-  %tmp49 = add i32 %tmp48, %arg19
-  %tmp50 = add i32 %tmp49, %arg20
-  %tmp51 = add i32 %tmp50, %arg21
-  %tmp52 = add i32 %tmp51, %arg22
-  %tmp53 = add i32 %tmp52, %arg23
-  %tmp54 = add i32 %tmp53, %arg24
-  %tmp55 = add i32 %tmp54, %arg25
-  %tmp56 = add i32 %tmp55, %arg26
-  %tmp57 = add i32 %tmp56, %arg27
-  %tmp58 = add i32 %tmp57, %arg28
-  %tmp59 = add i32 %tmp58, %arg29
-  %tmp60 = add i32 %tmp59, %arg30
-  %tmp61 = add i32 %tmp60, %arg31
-  %tmp62 = add i32 %tmp61, %arg32
-  ret i32 %tmp62
-}

diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 9c7279a78e75..8fedd62ac610 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -6,8 +6,8 @@
 ; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; A2V-NOT:    SCRATCH_RSRC
 ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
-; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
+; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
 ; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
 ; A2V:        ScratchSize: 0
 define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 {
@@ -35,8 +35,8 @@ bb:
 ; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; A2V-NOT:    SCRATCH_RSRC
 ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a4
-; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
+; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
 ; A2V:        v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
 ; A2V:        ScratchSize: 0
 define amdgpu_kernel void @max_12regs_13a_used(<4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
@@ -64,8 +64,8 @@ st:
 ; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; A2V-NOT:    SCRATCH_RSRC
 ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
-; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
+; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
 ; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
 ; A2V:        ScratchSize: 0
 define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #1 {
@@ -80,8 +80,8 @@ define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #1 {
 ; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; A2V-NOT:    SCRATCH_RSRC
 ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
-; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
+; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
 ; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
 ; A2V:        ScratchSize: 0
 define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {

diff --git a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir
index c56387918719..86f2de31e7f9 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir
@@ -10,7 +10,6 @@ name:            foo
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 registers:
   - { id: 0, class: sreg_64 }
@@ -36,6 +35,10 @@ registers:
   - { id: 1119, class: sgpr_128 }
   - { id: 1120, class: sgpr_128 }
   - { id: 1121, class: sgpr_128 }
+  - { id: 1122, class: sgpr_128 }
+  - { id: 1123, class: sgpr_128 }
+  - { id: 1124, class: sgpr_128 }
+  - { id: 1125, class: sgpr_128 }
 body:             |
   bb.0:
     successors: %bb.1
@@ -63,6 +66,10 @@ body:             |
     %1119 = COPY %1100
     %1120 = COPY %1100
     %1121 = COPY %1100
+    %1122 = COPY %1100
+    %1123 = COPY %1100
+    %1124 = COPY %1100
+    %1125 = COPY %1100
     S_BRANCH %bb.1
 
   bb.1:
@@ -97,6 +104,8 @@ body:             |
     S_CMP_EQ_U64 %1116.sub0_sub1, %1117.sub2_sub3, implicit-def $scc
     S_CMP_EQ_U64 %1118.sub0_sub1, %1119.sub2_sub3, implicit-def $scc
     S_CMP_EQ_U64 %1120.sub0_sub1, %1121.sub2_sub3, implicit-def $scc
+    S_CMP_EQ_U64 %1122.sub0_sub1, %1123.sub2_sub3, implicit-def $scc
+    S_CMP_EQ_U64 %1124.sub0_sub1, %1125.sub2_sub3, implicit-def $scc
 
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %0, implicit $vgpr0

diff --git a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
index 0cf19cea7811..fd0debda403c 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
@@ -21,7 +21,6 @@ name: expecting_non_empty_interval
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 body:             |
   bb.0:
@@ -55,7 +54,6 @@ name: rematerialize_empty_interval_has_reference
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 body:             |
   bb.0:

diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
index 2d7f7f9e33f5..1d65471dced1 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
@@ -14,7 +14,7 @@
 
 ; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
 ; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]]
-; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 ; 4-byte Folded Spill
+; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; 4-byte Folded Spill
 
 ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
 
@@ -22,7 +22,7 @@
 ; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 2
 ; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]]
 
-; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 ; 4-byte Folded Reload
+; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; 4-byte Folded Reload
 ; TOVMEM: s_waitcnt vmcnt(0)
 ; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]]
 ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]

diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
index 5ff1a1cab18e..969edbf12647 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -13,7 +13,7 @@ entry:
   %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 
   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
-  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
+  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
   %a = load volatile i32, i32 addrspace(5)* %aptr
 
   ; Force %a to spill.
@@ -35,7 +35,7 @@ entry:
 
   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
   ; 0x40000 / 64 = 4096 (for wave64)
-  ; CHECK: s_add_u32 s6, s7, 0x40000
+  ; CHECK: s_mov_b32 s6, 0x40000
   ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
   %a = load volatile i32, i32 addrspace(5)* %aptr
 
@@ -48,39 +48,8 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: test_sgpr_offset_kernel_scavenge_fail
-define amdgpu_kernel void @test_sgpr_offset_kernel_scavenge_fail() #1 {
-entry:
-  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
-  ; fit in the instruction, and has to live in the SGPR offset.
-  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
-  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
-
-  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
-
-  ; 0x40000 / 64 = 4096 (for wave64)
-  %a = load volatile i32, i32 addrspace(5)* %aptr
-
-  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
-  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
-  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
-  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
-  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
-  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
-  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
-  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
-  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
-
-  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
-
-  ; CHECK: s_add_u32 s7, s7, 0x40000
-  ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s7 ; 4-byte Folded Reload
-  ; CHECK: s_sub_u32 s7, s7, 0x40000
-
-   ; Force %a to spill with no free SGPRs
-  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
-  ret void
-}
+; FIXME: If we fail to scavenge an SGPR in a kernel, we don't have a stack
+; pointer to temporarily update, so we just crash.
 
 ; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail
 define void @test_sgpr_offset_function_scavenge_fail() #2 {
@@ -141,8 +110,8 @@ entry:
   %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
   %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
 
-  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
-  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
+  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
+  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
   %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
   %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
 
@@ -170,7 +139,7 @@ entry:
   %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
 
   ; 0x3ff00 / 64 = 4092 (for wave64)
-  ; CHECK: s_add_u32 s6, s7, 0x3ff00
+  ; CHECK: s_mov_b32 s6, 0x3ff00
   ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
   ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
   %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1

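A note on the arithmetic in the CHECK lines above: scratch is swizzled per
lane, so for wave64 an unswizzled offset of 0x40000 bytes corresponds to
0x40000 / 64 = 4096 bytes per lane. The MUBUF immediate offset field is a
12-bit unsigned value (at most 4095), so this spill cannot encode its offset
inline and must carry it in the SGPR SOffset operand. A reconstructed
before/after sketch of the spill sequence (the specific SGPR numbers are
illustrative, not mandated):

  ; Before: SOffset = scratch wave offset (s7) + frame offset
  s_add_u32 s6, s7, 0x40000
  buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill

  ; After: the wave offset lives in the buffer descriptor, so SOffset
  ; holds only the frame offset
  s_mov_b32 s6, 0x40000
  buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
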
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index be60a34b4208..497d35ea3d71 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -1,6 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GCN %s
 
+; FIXME: The MUBUF loads in this test output are incorrect: their SOffset
+; should use the frame offset register, not the ABI stack pointer register. We
+; rely on the frame index argument of MUBUF stack accesses to survive until PEI
+; so we can fix up the SOffset to use the correct frame register in
+; eliminateFrameIndex. Passes like LocalStackSlotAllocation can lift the
+; frame index into an instruction (e.g. `v_add_nc_u32`) that we cannot fold back
+; into the MUBUF instruction, and so we end up emitting an incorrect offset.
+; Fixing this may involve adding stack access pseudos so that we don't have to
+; speculatively refer to the ABI stack pointer register at all.
+
 ; An assert was hit when frame offset register was used to address FrameIndex.
 define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) {
 ; GCN-LABEL: kernel_background_evaluate:
@@ -10,14 +20,15 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
 ; GCN-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT:    s_mov_b32 s38, -1
 ; GCN-NEXT:    s_mov_b32 s39, 0x31c16000
-; GCN-NEXT:    s_mov_b32 s33, s3
-; GCN-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GCN-NEXT:    s_add_u32 s36, s36, s3
+; GCN-NEXT:    s_addc_u32 s37, s37, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x2000
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0x400000
-; GCN-NEXT:    s_add_u32 s32, s33, 0xc0000
+; GCN-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GCN-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GCN-NEXT:    s_mov_b32 s32, 0xc0000
 ; GCN-NEXT:    v_add_nc_u32_e64 v32, 4, 0x4000
 ; GCN-NEXT:    ; implicit-def: $vcc_hi
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
@@ -36,7 +47,7 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
 ; GCN-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, 0x41c64e6d, v0
 ; GCN-NEXT:    v_add_nc_u32_e32 v0, 0x3039, v0
-; GCN-NEXT:    buffer_store_dword v0, v0, s[36:39], s33 offen
+; GCN-NEXT:    buffer_store_dword v0, v0, s[36:39], 0 offen
 ; GCN-NEXT:  BB0_2: ; %shader_eval_surface.exit
 ; GCN-NEXT:    s_endpgm
 entry:

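The failure mode described in the FIXME above is visible in the checked
output: once LocalStackSlotAllocation lifts the frame index into VALU
arithmetic, eliminateFrameIndex never sees it and cannot rewrite the SOffset
operand. A sketch of the problematic pattern, condensed from the GCN lines
above:

  v_add_nc_u32_e64 v32, 4, 0x4000              ; frame index lifted into a VGPR add
  ; ...
  buffer_store_dword v0, v0, s[36:39], 0 offen ; the 0 SOffset can no longer
                                               ; be fixed up at PEI
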
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index deb94f521f00..93b6eacf3429 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -7,10 +7,12 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; VI-LABEL: max_alignment_128:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_add_u32 s4, s4, s7
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; VI-NEXT:    s_add_u32 s0, s0, s7
+; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, 9
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s5
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s7 offset:128
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:128
 ; VI-NEXT:    s_endpgm
 ; VI-NEXT:    .section .rodata,#alloc
 ; VI-NEXT:    .p2align 6
@@ -52,9 +54,11 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; GFX9-LABEL: max_alignment_128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
-; GFX9-NEXT:    v_mov_b32_e32 v0, 9
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s7 offset:128
+; GFX9-NEXT:    s_add_u32 s0, s0, s7
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 9
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:128
 ; GFX9-NEXT:    s_endpgm
 ; GFX9-NEXT:    .section .rodata,#alloc
 ; GFX9-NEXT:    .p2align 6
@@ -102,10 +106,12 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; VI-LABEL: stackrealign_attr:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_add_u32 s4, s4, s7
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; VI-NEXT:    s_add_u32 s0, s0, s7
+; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, 9
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s5
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s7 offset:4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
 ; VI-NEXT:    s_endpgm
 ; VI-NEXT:    .section .rodata,#alloc
 ; VI-NEXT:    .p2align 6
@@ -147,9 +153,11 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; GFX9-LABEL: stackrealign_attr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
-; GFX9-NEXT:    v_mov_b32_e32 v0, 9
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s7 offset:4
+; GFX9-NEXT:    s_add_u32 s0, s0, s7
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 9
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
 ; GFX9-NEXT:    s_endpgm
 ; GFX9-NEXT:    .section .rodata,#alloc
 ; GFX9-NEXT:    .p2align 6
@@ -197,10 +205,12 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; VI-LABEL: alignstack_attr:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_add_u32 s4, s4, s7
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; VI-NEXT:    s_add_u32 s0, s0, s7
+; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, 9
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s5
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s7 offset:4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
 ; VI-NEXT:    s_endpgm
 ; VI-NEXT:    .section .rodata,#alloc
 ; VI-NEXT:    .p2align 6
@@ -242,9 +252,11 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; GFX9-LABEL: alignstack_attr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
-; GFX9-NEXT:    v_mov_b32_e32 v0, 9
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s7 offset:4
+; GFX9-NEXT:    s_add_u32 s0, s0, s7
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 9
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
 ; GFX9-NEXT:    s_endpgm
 ; GFX9-NEXT:    .section .rodata,#alloc
 ; GFX9-NEXT:    .p2align 6

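In the hunks above, the kernel prologue gains two scalar adds: the scratch
wave offset (here s7) is folded once into the low 64 bits of the scratch
buffer descriptor, after which every stack access uses an inline zero
SOffset. Schematically (register assignments match the GFX9 output above):

  s_add_u32  s0, s0, s7   ; fold wave offset into the SRD base (low dword)
  s_addc_u32 s1, s1, 0    ; propagate the carry into the high dword
  buffer_store_dword v0, off, s[0:3], 0 offset:128
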
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 5674f6e15e10..022cc1c7d28f 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -9,18 +9,17 @@
 ; = 144 bytes with padding between them
 
 ; GCN-LABEL: {{^}}needs_align16_default_stack_align:
-; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, v0
-; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, [[SUB]]
+; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, s32
 ; GCN: v_add_u32_e32 [[FI:v[0-9]+]], vcc, [[FRAMEDIFF]], [[SCALED_IDX]]
 
 ; GCN-NOT: s32
 
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 
 ; GCN-NOT: s32
 
@@ -35,13 +34,13 @@ define void @needs_align16_default_stack_align(i32 %idx) #0 {
 ; GCN-LABEL: {{^}}needs_align16_stack_align4:
 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}}
 ; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffffc00
-; GCN: s_add_u32 s32, s32, 0x2800{{$}}
 
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
+; GCN: s_add_u32 s32, s32, 0x2800{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 
 ; GCN: s_sub_u32 s32, s32, 0x2800
 
@@ -56,13 +55,13 @@ define void @needs_align16_stack_align4(i32 %idx) #2 {
 ; GCN-LABEL: {{^}}needs_align32:
 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}}
 ; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffff800
-; GCN: s_add_u32 s32, s32, 0x3000{{$}}
 
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
+; GCN: s_add_u32 s32, s32, 0x3000{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 
 ; GCN: s_sub_u32 s32, s32, 0x3000
 
@@ -79,7 +78,7 @@ define void @needs_align32(i32 %idx) #0 {
 ; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xffffff00
 ; GCN: s_add_u32 s32, s32, 0xd00{{$}}
 
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 ; GCN: s_sub_u32 s32, s32, 0xd00
 
 ; GCN: ; ScratchSize: 52
@@ -91,8 +90,7 @@ define void @force_realign4(i32 %idx) #1 {
 }
 
 ; GCN-LABEL: {{^}}kernel_call_align16_from_8:
-; GCN: s_mov_b32 s33, s7{{$}}
-; GCN-NEXT: s_add_u32 s32, s33, 0x400{{$}}
+; GCN: s_movk_i32 s32, 0x400{{$}}
 ; GCN-NOT: s32
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
@@ -104,8 +102,7 @@ define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
 
 ; The call sequence should keep the stack on call aligned to 4
 ; GCN-LABEL: {{^}}kernel_call_align16_from_5:
-; GCN: s_mov_b32 s33, s7{{$}}
-; GCN-NEXT: s_add_u32 s32, s33, 0x400
+; GCN: s_movk_i32 s32, 0x400
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_align16_from_5() {
   %alloca0 = alloca i8, align 1, addrspace(5)
@@ -116,8 +113,7 @@ define amdgpu_kernel void @kernel_call_align16_from_5() {
 }
 
 ; GCN-LABEL: {{^}}kernel_call_align4_from_5:
-; GCN: s_mov_b32 s33, s7{{$}}
-; GCN: s_add_u32 s32, s33, 0x400
+; GCN: s_movk_i32 s32, 0x400
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_align4_from_5() {
   %alloca0 = alloca i8, align 1, addrspace(5)

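In the kernel-call tests just above, the ABI stack pointer setup collapses
from two instructions to one, since s32 no longer needs the scratch wave
offset (s7) added in:

  ; Before
  s_mov_b32 s33, s7         ; copy scratch wave offset
  s_add_u32 s32, s33, 0x400 ; SP = wave offset + frame size
  ; After
  s_movk_i32 s32, 0x400     ; SP is just the frame size
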
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
index 77010834cc3e..e446320f55a2 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
@@ -12,15 +12,14 @@
 # CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
 # CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
 
-# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr6, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.1, addrspace 5)
-# CHECK: $sgpr6 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.1, addrspace 5)
+# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.1, addrspace 5)
+# CHECK: $sgpr5 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.1, addrspace 5)
 
 name: no_merge_sgpr_vgpr_spill_slot
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
-  frameOffsetReg: $sgpr5
+  frameOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 body: |
   bb.0:

diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll
index e8d6b24efd3e..51ed07aa2964 100644
--- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll
@@ -389,10 +389,10 @@ entry:
 ; GCN-LABEL: {{^}}store_private_hi_v2i16:
 ; GCN: s_waitcnt
 
-; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}}
+; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
 
 ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
-; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s33 offen{{$}}
+; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
@@ -408,10 +408,10 @@ entry:
 ; GCN-LABEL: {{^}}store_private_hi_v2f16:
 ; GCN: s_waitcnt
 
-; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}}
+; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
 
 ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
-; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s33 offen{{$}}
+; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
@@ -427,10 +427,10 @@ entry:
 ; GCN-LABEL: {{^}}store_private_hi_i32_shift:
 ; GCN: s_waitcnt
 
-; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}}
+; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
 
 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], s33 offen{{$}}
+; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
@@ -445,10 +445,10 @@ entry:
 ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8:
 ; GCN: s_waitcnt
 
-; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s33 offen{{$}}
+; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
 
 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s33 offen{{$}}
+; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}}
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
@@ -464,10 +464,10 @@ entry:
 ; GCN-LABEL: {{^}}store_private_hi_i8_shift:
 ; GCN: s_waitcnt
 
-; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s33 offen{{$}}
+; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
 
 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s33 offen{{$}}
+; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}}
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
@@ -502,10 +502,10 @@ entry:
 ; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff:
 ; GCN: s_waitcnt
 
-; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s33{{$}}
+; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}}
 
 ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s33{{$}}
+; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], 0{{$}}
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
@@ -522,10 +522,10 @@ entry:
 ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff:
 ; GCN: s_waitcnt
 
-; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s33{{$}}
+; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}}
 
 ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
-; NO-D16-HI: buffer_store_byte v0, off, s[0:3], s33{{$}}
+; NO-D16-HI: buffer_store_byte v0, off, s[0:3], 0{{$}}
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64

diff --git a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
index 05ddadad86bb..778c12b3dae0 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
+++ b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
@@ -41,7 +41,6 @@ name:            _amdgpu_ps_main
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   stackPtrOffsetReg: $sgpr32
 liveins:
   - { reg: '$vgpr2', virtual-reg: '%0' }

diff --git a/llvm/test/CodeGen/AMDGPU/subvector-test.mir b/llvm/test/CodeGen/AMDGPU/subvector-test.mir
index 508731a75e1b..8fd27d1c62d3 100644
--- a/llvm/test/CodeGen/AMDGPU/subvector-test.mir
+++ b/llvm/test/CodeGen/AMDGPU/subvector-test.mir
@@ -7,7 +7,6 @@ name:            subvector-basic-bb
 tracksRegLiveness: true
 machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
-  scratchWaveOffsetReg: $sgpr4
   frameOffsetReg: $sgpr5
   stackPtrOffsetReg: $sgpr32
 body:             |

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index b1ad28a2e40f..d9327368ac82 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -22,8 +22,8 @@
 ; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000
 
 ; OFFREG is offset system SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s12 offset:{{[0-9]+}} ; 4-byte Folded Reload
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1536
 

diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
index a5d5e7c82d70..98849ba3cbc4 100644
--- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
@@ -24,7 +24,6 @@ frameInfo:
 machineFunctionInfo:
   isEntryFunction: true
   scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  scratchWaveOffsetReg: '$sgpr95'
   frameOffsetReg:  '$sgpr95'
   stackPtrOffsetReg: '$sgpr32'
 body:             |

diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 1026d63d70b7..e3183989e7d2 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -693,11 +693,11 @@ break:
 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 ; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
 ; CHECK: s_wqm_b64 exec, exec
-; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4{{$}}
+; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 ; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
 ; CHECK: s_wqm_b64 exec, exec
-; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 
 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 ; CHECK: image_sample

diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 1b1118e7869d..c69a9f58965e 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -44,7 +44,7 @@ entry:
 ; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]]
 ; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]]
-; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[FIRST_SGPR_OFFSET:[0-9]+]] offset:[[FIRST_IMM_OFFSET:[0-9]+]]
+; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET:[0-9]+]]
   %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
   %tmp121 = add i32 %tmp105, %tmp120
   %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121)
@@ -58,7 +58,7 @@ if:
 ; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]]
 ; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]]
-; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[SECOND_SGPR_OFFSET:[0-9]+]] offset:[[SECOND_IMM_OFFSET:[0-9]+]]
+; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET:[0-9]+]]
   %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
   %tmp136 = add i32 %tmp107, %tmp135
   %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
@@ -67,8 +67,8 @@ if:
 merge:
   %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ]
 ; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]]
-; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[SECOND_SGPR_OFFSET]] offset:[[SECOND_IMM_OFFSET]]
-; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[FIRST_SGPR_OFFSET]] offset:[[FIRST_IMM_OFFSET]]
+; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET]]
+; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET]]
 ; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]]
   %tmp138 = icmp eq i32 %tmp122, %merge_value
   %tmp139 = sext i1 %tmp138 to i32

diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 8bc20c9b88c9..e39ff3b350c8 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -13,7 +13,6 @@
 # FULL-NEXT: memoryBound:     true
 # FULL-NEXT: waveLimiter:     true
 # FULL-NEXT: scratchRSrcReg:  '$sgpr8_sgpr9_sgpr10_sgpr11'
-# FULL-NEXT: scratchWaveOffsetReg:  '$sgpr12'
 # FULL-NEXT: frameOffsetReg:  '$sgpr12'
 # FULL-NEXT: stackPtrOffsetReg:  '$sgpr13'
 # FULL-NEXT: argumentInfo:
@@ -40,7 +39,6 @@
 # SIMPLE-NEXT: memoryBound: true
 # SIMPLE-NEXT: waveLimiter: true
 # SIMPLE-NEXT: scratchRSrcReg: '$sgpr8_sgpr9_sgpr10_sgpr11'
-# SIMPLE-NEXT: scratchWaveOffsetReg:  '$sgpr12'
 # SIMPLE-NEXT: frameOffsetReg:  '$sgpr12'
 # SIMPLE-NEXT: stackPtrOffsetReg:  '$sgpr13'
 # SIMPLE-NEXT: argumentInfo:
@@ -60,7 +58,6 @@ machineFunctionInfo:
   memoryBound:     true
   waveLimiter:     true
   scratchRSrcReg:  '$sgpr8_sgpr9_sgpr10_sgpr11'
-  scratchWaveOffsetReg:  '$sgpr12'
   frameOffsetReg: '$sgpr12'
   stackPtrOffsetReg:  '$sgpr13'
   argumentInfo:
@@ -87,12 +84,10 @@ body:             |
 # FULL-NEXT: memoryBound:     false
 # FULL-NEXT: waveLimiter:     false
 # FULL-NEXT: scratchRSrcReg:  '$private_rsrc_reg'
-# FULL-NEXT: scratchWaveOffsetReg:  '$scratch_wave_offset_reg'
 # FULL-NEXT: frameOffsetReg:  '$fp_reg'
 # FULL-NEXT: stackPtrOffsetReg:  '$sp_reg'
 # FULL-NEXT: argumentInfo:
 # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' }
 # FULL-NEXT: mode:
 # FULL-NEXT: ieee: true
 # FULL-NEXT: dx10-clamp: true
@@ -107,7 +102,6 @@ body:             |
 # SIMPLE-NEXT: maxKernArgAlign: 1
 # SIMPLE-NEXT: argumentInfo:
 # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' }
 # SIMPLE-NEXT: body:
 
 name: no_mfi
@@ -128,12 +122,10 @@ body:             |
 # FULL-NEXT: memoryBound:     false
 # FULL-NEXT: waveLimiter:     false
 # FULL-NEXT: scratchRSrcReg:  '$private_rsrc_reg'
-# FULL-NEXT: scratchWaveOffsetReg:  '$scratch_wave_offset_reg'
 # FULL-NEXT: frameOffsetReg:  '$fp_reg'
 # FULL-NEXT: stackPtrOffsetReg:  '$sp_reg'
 # FULL-NEXT: argumentInfo:
 # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' }
 # FULL-NEXT: mode:
 # FULL-NEXT: ieee: true
 # FULL-NEXT: dx10-clamp: true
@@ -148,7 +140,6 @@ body:             |
 # SIMPLE-NEXT: maxKernArgAlign: 1
 # SIMPLE-NEXT: argumentInfo:
 # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' }
 # SIMPLE-NEXT: body:
 
 name: empty_mfi
@@ -170,12 +161,10 @@ body:             |
 # FULL-NEXT: memoryBound:     false
 # FULL-NEXT: waveLimiter:     false
 # FULL-NEXT: scratchRSrcReg:  '$private_rsrc_reg'
-# FULL-NEXT: scratchWaveOffsetReg:  '$scratch_wave_offset_reg'
 # FULL-NEXT: frameOffsetReg:  '$fp_reg'
 # FULL-NEXT: stackPtrOffsetReg:  '$sp_reg'
 # FULL-NEXT: argumentInfo:
 # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' }
 # FULL-NEXT: mode:
 # FULL-NEXT: ieee: true
 # FULL-NEXT: dx10-clamp: true
@@ -191,7 +180,6 @@ body:             |
 # SIMPLE-NEXT: isEntryFunction: true
 # SIMPLE-NEXT: argumentInfo:
 # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' }
 # SIMPLE-NEXT: body:
 
 name: empty_mfi_entry_func
@@ -207,12 +195,10 @@ body:             |
 # ALL-LABEL: name: default_regs_mfi
 
 # FULL: scratchRSrcReg:  '$private_rsrc_reg'
-# FULL-NEXT: scratchWaveOffsetReg:  '$scratch_wave_offset_reg'
 # FULL-NEXT: frameOffsetReg:  '$fp_reg'
 # FULL-NEXT: stackPtrOffsetReg:  '$sp_reg'
 
 # SIMPLE-NOT: scratchRSrcReg
-# SIMPLE-NOT: scratchWaveOffsetReg
 # SIMPLE-NOT:: stackPtrOffsetReg
 name: default_regs_mfi
 machineFunctionInfo:
@@ -230,13 +216,11 @@ body:             |
 # FULL: argumentInfo:
 # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 # FULL-NEXT: flatScratchInit: { offset: 4 }
-# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' }
 # FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
 
 # SIMPLE: argumentInfo:
 # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 # SIMPLE-NEXT: flatScratchInit: { offset: 4 }
-# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' }
 # SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
 name: fake_stack_arginfo
 machineFunctionInfo:

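For MIR test upkeep, the visible consequence in this file is that
`scratchWaveOffsetReg` is no longer a serialized field of
`machineFunctionInfo`, while the remaining scratch-related fields are
unchanged. A minimal block after this change (register values taken from the
tests in this patch):

  machineFunctionInfo:
    scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
    frameOffsetReg:    '$sgpr34'
    stackPtrOffsetReg: '$sgpr32'
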
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index 87629c3ae3db..95a70cfb33d0 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -16,9 +16,8 @@
 ; CHECK-NEXT: memoryBound: false
 ; CHECK-NEXT: waveLimiter: false
 ; CHECK-NEXT: scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
-; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101'
-; CHECK-NEXT: frameOffsetReg:  '$sgpr101'
-; CHECK-NEXT: stackPtrOffsetReg: '$sgpr101'
+; CHECK-NEXT: frameOffsetReg:  '$fp_reg'
+; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
 ; CHECK-NEXT: argumentInfo:
 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
@@ -50,9 +49,8 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
 ; CHECK-NEXT: memoryBound: false
 ; CHECK-NEXT: waveLimiter: false
 ; CHECK-NEXT: scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
-; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101'
-; CHECK-NEXT: frameOffsetReg:  '$sgpr101'
-; CHECK-NEXT: stackPtrOffsetReg: '$sgpr101'
+; CHECK-NEXT: frameOffsetReg:  '$fp_reg'
+; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
 ; CHECK-NEXT: argumentInfo:
 ; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr3' }
 ; CHECK-NEXT: implicitBufferPtr: { reg: '$sgpr0_sgpr1' }
@@ -79,12 +77,10 @@ define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
 ; CHECK-NEXT: memoryBound: false
 ; CHECK-NEXT: waveLimiter: false
 ; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
-; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr33'
 ; CHECK-NEXT: frameOffsetReg: '$sgpr34'
 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
 ; CHECK-NEXT: argumentInfo:
 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' }
 ; CHECK-NEXT: mode:
 ; CHECK-NEXT: ieee: true
 ; CHECK-NEXT: dx10-clamp: true
@@ -108,12 +104,10 @@ define void @function() {
 ; CHECK-NEXT: memoryBound: false
 ; CHECK-NEXT: waveLimiter: false
 ; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
-; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr33'
 ; CHECK-NEXT: frameOffsetReg: '$sgpr34'
 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
 ; CHECK-NEXT: argumentInfo:
 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' }
 ; CHECK-NEXT: mode:
 ; CHECK-NEXT: ieee: true
 ; CHECK-NEXT: dx10-clamp: true

diff --git a/llvm/test/CodeGen/MIR/AMDGPU/mfi-parse-error-scratch-wave-offset-reg.mir b/llvm/test/CodeGen/MIR/AMDGPU/mfi-parse-error-scratch-wave-offset-reg.mir
deleted file mode 100644
index bbf58085cfa0..000000000000
--- a/llvm/test/CodeGen/MIR/AMDGPU/mfi-parse-error-scratch-wave-offset-reg.mir
+++ /dev/null
@@ -1,12 +0,0 @@
-# RUN: not llc -march=amdgcn -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
-# CHECK: :7:27: expected a named register
-# CHECK: scratchWaveOffsetReg:  ''
----
-name: empty_scratch_wave_offset_reg
-machineFunctionInfo:
-  scratchWaveOffsetReg:  ''
-body:             |
-  bb.0:
-
-    S_ENDPGM
-...

diff --git a/llvm/test/CodeGen/MIR/AMDGPU/mfi-scratch-wave-offset-reg-class.mir b/llvm/test/CodeGen/MIR/AMDGPU/mfi-scratch-wave-offset-reg-class.mir
deleted file mode 100644
index 8e765ba01e32..000000000000
--- a/llvm/test/CodeGen/MIR/AMDGPU/mfi-scratch-wave-offset-reg-class.mir
+++ /dev/null
@@ -1,13 +0,0 @@
-# RUN: not llc -march=amdgcn -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
-# CHECK: :8:33: incorrect register class for field
-# CHECK: scratchWaveOffsetReg:  '$vgpr0'
-
----
-name: wrong_reg_class_scratch_wave_offset_reg
-machineFunctionInfo:
-  scratchWaveOffsetReg:  '$vgpr0'
-body:             |
-  bb.0:
-
-    S_ENDPGM
-...

diff --git a/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir b/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir
index c48b13e46e20..8bed8fe6af16 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir
@@ -10,7 +10,6 @@
 # CHECK: machineFunctionInfo:
 # CHECK: isEntryFunction: true
 # CHECK: scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-# CHECK: scratchWaveOffsetReg: '$sgpr50'
 # CHECK: frameOffsetReg:  '$sgpr50'
 # CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
 name: reserve_correct_register
@@ -18,7 +17,6 @@ tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
   scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
-  scratchWaveOffsetReg: '$sgpr50'
   frameOffsetReg:  '$sgpr50'
 stack:
   - { id: 0, type: default, offset: 0, size: 4, alignment: 4 }

diff --git a/llvm/test/DebugInfo/AMDGPU/variable-locations.ll b/llvm/test/DebugInfo/AMDGPU/variable-locations.ll
index cb8ebb6a6997..365dd9dfea78 100644
--- a/llvm/test/DebugInfo/AMDGPU/variable-locations.ll
+++ b/llvm/test/DebugInfo/AMDGPU/variable-locations.ll
@@ -32,7 +32,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
 @GlobB = common addrspace(1) global i32 0, align 4, !dbg !6
 
 ; CHECK: {{.*}}DW_TAG_subprogram
-; CHECK: DW_AT_frame_base [DW_FORM_block1]	(DW_OP_reg{{.*}} SGPR9)
+; CHECK-NOT: DW_AT_frame_base
 
 define amdgpu_kernel void @kernel1(
 ; CHECK: {{.*}}DW_TAG_formal_parameter

