[llvm] d2e52ee - AMDGPU: Select global saddr mode from SGPR pointer

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 16 08:51:15 PST 2020


Author: Matt Arsenault
Date: 2020-11-16T11:51:06-05:00
New Revision: d2e52eec513a475fb69af7718e00a6aaac7738e8

URL: https://github.com/llvm/llvm-project/commit/d2e52eec513a475fb69af7718e00a6aaac7738e8
DIFF: https://github.com/llvm/llvm-project/commit/d2e52eec513a475fb69af7718e00a6aaac7738e8.diff

LOG: AMDGPU: Select global saddr mode from SGPR pointer

Use the 64-bit SGPR pointer directly as the saddr base with a zero VGPR
offset: materializing a single 32-bit zero takes one fewer instruction
than the two moves needed to copy the 64-bit SGPR pointer into VGPRs.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
    llvm/test/CodeGen/AMDGPU/addrspacecast.ll
    llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
    llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
    llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
    llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
    llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
    llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
    llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
    llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
    llvm/test/CodeGen/AMDGPU/ds_read2.ll
    llvm/test/CodeGen/AMDGPU/fabs.f16.ll
    llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
    llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
    llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
    llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
    llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
    llvm/test/CodeGen/AMDGPU/fneg.f16.ll
    llvm/test/CodeGen/AMDGPU/fshl.ll
    llvm/test/CodeGen/AMDGPU/fshr.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
    llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
    llvm/test/CodeGen/AMDGPU/global_atomics.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
    llvm/test/CodeGen/AMDGPU/hsa.ll
    llvm/test/CodeGen/AMDGPU/idiv-licm.ll
    llvm/test/CodeGen/AMDGPU/idot2.ll
    llvm/test/CodeGen/AMDGPU/idot4s.ll
    llvm/test/CodeGen/AMDGPU/idot4u.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
    llvm/test/CodeGen/AMDGPU/kernel-args.ll
    llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
    llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
    llvm/test/CodeGen/AMDGPU/load-global-i32.ll
    llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
    llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
    llvm/test/CodeGen/AMDGPU/mad.u16.ll
    llvm/test/CodeGen/AMDGPU/mai-inline.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
    llvm/test/CodeGen/AMDGPU/memory_clause.ll
    llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
    llvm/test/CodeGen/AMDGPU/offset-split-global.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
    llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
    llvm/test/CodeGen/AMDGPU/saddo.ll
    llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
    llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
    llvm/test/CodeGen/AMDGPU/store-global.ll
    llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
    llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll
    llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
    llvm/test/CodeGen/AMDGPU/wave32.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 339d6fb42f96..2ad7fab81427 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1821,10 +1821,9 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
 
   // Match the immediate offset first, which canonically is moved as low as
   // possible.
-  if (CurDAG->isBaseWithConstantOffset(Addr)) {
-    SDValue LHS = Addr.getOperand(0);
-    SDValue RHS = Addr.getOperand(1);
 
+  SDValue LHS, RHS;
+  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
     const SIInstrInfo *TII = Subtarget->getInstrInfo();
 
@@ -1852,11 +1851,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
   }
 
   // Match the variable offset.
-  if (Addr.getOpcode() != ISD::ADD)
-    return false;
+  if (Addr.getOpcode() != ISD::ADD) {
+    if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
+        isa<ConstantSDNode>(Addr))
+      return false;
+
+    // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+    // moves required to copy a 64-bit SGPR to VGPR.
+    SAddr = Addr;
+    SDNode *VMov = CurDAG->getMachineNode(
+      AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
+      CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
+    VOffset = SDValue(VMov, 0);
+    Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+    return true;
+  }
 
-  SDValue LHS = Addr.getOperand(0);
-  SDValue RHS = Addr.getOperand(1);
+  LHS = Addr.getOperand(0);
+  RHS = Addr.getOperand(1);
 
   if (!LHS->isDivergent()) {
     // add (i64 sgpr), (zero_extend (i32 vgpr))

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index feebe259fe9e..37a79ce4fa37 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3536,20 +3536,40 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
     return None;
 
   // Match the variable offset.
-  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
-    return None;
+  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
+    // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
+    // drop this.
+    if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
+        AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
+      return None;
+
+    // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+    // moves required to copy a 64-bit SGPR to VGPR.
+    const Register SAddr = AddrDef->Reg;
+    if (!isSGPR(SAddr))
+      return None;
+
+    MachineInstr *MI = Root.getParent();
+    MachineBasicBlock *MBB = MI->getParent();
+    Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
+            VOffset)
+      .addImm(0);
+
+    return {{
+        [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); },    // saddr
+        [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },  // voffset
+        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+    }};
+  }
 
   // Look through the SGPR->VGPR copy.
-  Register PtrBaseSrc =
+  Register SAddr =
     getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
-  if (!PtrBaseSrc)
-    return None;
-
-  const RegisterBank *BaseRB = RBI.getRegBank(PtrBaseSrc, *MRI, TRI);
-  if (BaseRB->getID() != AMDGPU::SGPRRegBankID)
+  if (!SAddr || !isSGPR(SAddr))
     return None;
 
-  Register SAddr = PtrBaseSrc;
   Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
 
   // It's possible voffset is an SGPR here, but the copy to VGPR will be

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 8d301aea9803..02d9d3cfbb85 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -1679,7 +1679,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
 ; GPRIDX-NEXT:     wavefront_sgpr_count = 9
-; GPRIDX-NEXT:     workitem_vgpr_count = 4
+; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -1710,10 +1710,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
 ; GPRIDX-NEXT:    s_cmp_eq_u32 s8, 4
 ; GPRIDX-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
 ; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s6
 ; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT:    v_mov_b32_e32 v3, s7
-; GPRIDX-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, 0
+; GPRIDX-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GPRIDX-NEXT:    s_endpgm
 ;
 ; MOVREL-LABEL: dyn_extract_v5f64_s_s:
@@ -2194,7 +2193,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
 ; GPRIDX-NEXT:     wavefront_sgpr_count = 6
-; GPRIDX-NEXT:     workitem_vgpr_count = 3
+; GPRIDX-NEXT:     workitem_vgpr_count = 2
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -2211,17 +2210,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32
 ; GPRIDX-NEXT:  ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GPRIDX-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, 0
 ; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
 ; GPRIDX-NEXT:    s_cmp_eq_u32 s2, 1
 ; GPRIDX-NEXT:    s_cselect_b32 s3, 2.0, 1.0
 ; GPRIDX-NEXT:    s_cmp_eq_u32 s2, 2
 ; GPRIDX-NEXT:    s_cselect_b32 s3, 0x40400000, s3
 ; GPRIDX-NEXT:    s_cmp_eq_u32 s2, 3
 ; GPRIDX-NEXT:    s_cselect_b32 s2, 4.0, s3
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT:    global_store_dword v[0:1], v2, off
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s2
+; GPRIDX-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GPRIDX-NEXT:    s_endpgm
 ;
 ; MOVREL-LABEL: dyn_extract_v4f32_s_s_s:
@@ -2370,7 +2368,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
 ; GPRIDX-NEXT:     wavefront_sgpr_count = 7
-; GPRIDX-NEXT:     workitem_vgpr_count = 4
+; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -2389,8 +2387,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
 ; GPRIDX-NEXT:    s_load_dword s6, s[4:5], 0x8
 ; GPRIDX-NEXT:    s_mov_b32 s0, 0
 ; GPRIDX-NEXT:    s_mov_b32 s1, 0x40080000
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, 0
 ; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
 ; GPRIDX-NEXT:    s_cmp_eq_u32 s6, 1
 ; GPRIDX-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
 ; GPRIDX-NEXT:    s_cmp_eq_u32 s6, 2
@@ -2399,8 +2397,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
 ; GPRIDX-NEXT:    s_cselect_b64 s[0:1], 4.0, s[0:1]
 ; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
 ; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT:    v_mov_b32_e32 v3, s3
-; GPRIDX-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GPRIDX-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GPRIDX-NEXT:    s_endpgm
 ;
 ; MOVREL-LABEL: dyn_extract_v4f64_s_s_s:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index 7901f2286b2a..88af44653468 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -7,36 +7,35 @@
 define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.ptr, <64 x i32> addrspace(1)* %ptr, i32 %val, i32 %idx) #0 {
 ; GCN-LABEL: v_insert_v64i32_varidx:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    v_mov_b32_e32 v16, 0x100
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    v_add_u32_e32 v31, 64, v16
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx16 s[12:27], s[10:11], 0x0
+; GCN-NEXT:    s_load_dwordx16 s[36:51], s[10:11], 0x0
 ; GCN-NEXT:    s_load_dwordx16 s[52:67], s[10:11], 0x40
-; GCN-NEXT:    s_load_dwordx16 s[36:51], s[10:11], 0x80
+; GCN-NEXT:    s_load_dwordx16 s[12:27], s[10:11], 0x80
+; GCN-NEXT:    v_add_u32_e32 v31, 64, v16
 ; GCN-NEXT:    v_add_u32_e32 v32, 0x44, v16
-; GCN-NEXT:    v_add_u32_e32 v33, 0x48, v16
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-NEXT:    v_mov_b32_e32 v2, s14
-; GCN-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NEXT:    v_mov_b32_e32 v4, s16
-; GCN-NEXT:    v_mov_b32_e32 v5, s17
-; GCN-NEXT:    v_mov_b32_e32 v6, s18
-; GCN-NEXT:    v_mov_b32_e32 v7, s19
-; GCN-NEXT:    v_mov_b32_e32 v8, s20
-; GCN-NEXT:    v_mov_b32_e32 v9, s21
-; GCN-NEXT:    v_mov_b32_e32 v10, s22
-; GCN-NEXT:    v_mov_b32_e32 v11, s23
-; GCN-NEXT:    v_mov_b32_e32 v12, s24
-; GCN-NEXT:    v_mov_b32_e32 v13, s25
-; GCN-NEXT:    v_mov_b32_e32 v14, s26
-; GCN-NEXT:    v_mov_b32_e32 v15, s27
-; GCN-NEXT:    s_load_dwordx16 s[12:27], s[10:11], 0xc0
+; GCN-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NEXT:    v_mov_b32_e32 v1, s37
+; GCN-NEXT:    v_mov_b32_e32 v2, s38
+; GCN-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-NEXT:    v_mov_b32_e32 v4, s40
+; GCN-NEXT:    v_mov_b32_e32 v5, s41
+; GCN-NEXT:    v_mov_b32_e32 v6, s42
+; GCN-NEXT:    v_mov_b32_e32 v7, s43
+; GCN-NEXT:    v_mov_b32_e32 v8, s44
+; GCN-NEXT:    v_mov_b32_e32 v9, s45
+; GCN-NEXT:    v_mov_b32_e32 v10, s46
+; GCN-NEXT:    v_mov_b32_e32 v11, s47
+; GCN-NEXT:    v_mov_b32_e32 v12, s48
+; GCN-NEXT:    v_mov_b32_e32 v13, s49
+; GCN-NEXT:    v_mov_b32_e32 v14, s50
+; GCN-NEXT:    v_mov_b32_e32 v15, s51
+; GCN-NEXT:    s_load_dwordx16 s[36:51], s[10:11], 0xc0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:256
 ; GCN-NEXT:    v_add_u32_e32 v0, 4, v16
 ; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -44,13 +43,13 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    buffer_store_dword v1, v31, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v1, s53
 ; GCN-NEXT:    buffer_store_dword v1, v32, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v33, 0x48, v16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s54
 ; GCN-NEXT:    buffer_store_dword v1, v33, s[0:3], 0 offen
-; GCN-NEXT:    s_movk_i32 s4, 0x50
 ; GCN-NEXT:    v_add_u32_e32 v34, 0x4c, v16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s55
 ; GCN-NEXT:    buffer_store_dword v1, v34, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v35, s4, v16
+; GCN-NEXT:    v_add_u32_e32 v35, 0x50, v16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s56
 ; GCN-NEXT:    buffer_store_dword v1, v35, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v36, 0x54, v16
@@ -59,11 +58,10 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_add_u32_e32 v37, 0x58, v16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s58
 ; GCN-NEXT:    buffer_store_dword v1, v37, s[0:3], 0 offen
-; GCN-NEXT:    s_movk_i32 s5, 0x60
 ; GCN-NEXT:    v_add_u32_e32 v38, 0x5c, v16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s59
 ; GCN-NEXT:    buffer_store_dword v1, v38, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v39, s5, v16
+; GCN-NEXT:    v_add_u32_e32 v39, 0x60, v16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s60
 ; GCN-NEXT:    buffer_store_dword v1, v39, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v40, 0x64, v16
@@ -72,11 +70,10 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_add_u32_e32 v41, 0x68, v16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s62
 ; GCN-NEXT:    buffer_store_dword v1, v41, s[0:3], 0 offen
-; GCN-NEXT:    s_movk_i32 s10, 0x70
 ; GCN-NEXT:    v_add_u32_e32 v42, 0x6c, v16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s63
 ; GCN-NEXT:    buffer_store_dword v1, v42, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v43, s10, v16
+; GCN-NEXT:    v_add_u32_e32 v43, 0x70, v16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s64
 ; GCN-NEXT:    buffer_store_dword v1, v43, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v44, 0x74, v16
@@ -89,110 +86,104 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_mov_b32_e32 v1, s67
 ; GCN-NEXT:    buffer_store_dword v1, v46, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v47, 0x80, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s36
+; GCN-NEXT:    v_mov_b32_e32 v1, s12
 ; GCN-NEXT:    buffer_store_dword v1, v47, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v48, 0x84, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s37
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-NEXT:    buffer_store_dword v1, v48, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v49, 0x88, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s38
+; GCN-NEXT:    v_mov_b32_e32 v1, s14
 ; GCN-NEXT:    buffer_store_dword v1, v49, s[0:3], 0 offen
-; GCN-NEXT:    s_movk_i32 s11, 0x90
 ; GCN-NEXT:    v_add_u32_e32 v50, 0x8c, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s39
+; GCN-NEXT:    v_mov_b32_e32 v1, s15
 ; GCN-NEXT:    buffer_store_dword v1, v50, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v51, s11, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s40
+; GCN-NEXT:    v_add_u32_e32 v51, 0x90, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, s16
 ; GCN-NEXT:    buffer_store_dword v1, v51, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v52, 0x94, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s41
+; GCN-NEXT:    v_mov_b32_e32 v1, s17
 ; GCN-NEXT:    buffer_store_dword v1, v52, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v53, 0x98, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s42
+; GCN-NEXT:    v_mov_b32_e32 v1, s18
 ; GCN-NEXT:    buffer_store_dword v1, v53, s[0:3], 0 offen
-; GCN-NEXT:    s_movk_i32 s28, 0xa0
 ; GCN-NEXT:    v_add_u32_e32 v54, 0x9c, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s43
+; GCN-NEXT:    v_mov_b32_e32 v1, s19
 ; GCN-NEXT:    buffer_store_dword v1, v54, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v55, s28, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s44
+; GCN-NEXT:    v_add_u32_e32 v55, 0xa0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, s20
 ; GCN-NEXT:    buffer_store_dword v1, v55, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v56, 0xa4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s45
+; GCN-NEXT:    v_mov_b32_e32 v1, s21
 ; GCN-NEXT:    buffer_store_dword v1, v56, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v57, 0xa8, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s46
+; GCN-NEXT:    v_mov_b32_e32 v1, s22
 ; GCN-NEXT:    buffer_store_dword v1, v57, s[0:3], 0 offen
-; GCN-NEXT:    s_movk_i32 s29, 0xb0
 ; GCN-NEXT:    v_add_u32_e32 v58, 0xac, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s47
+; GCN-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-NEXT:    buffer_store_dword v1, v58, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v59, s29, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s48
+; GCN-NEXT:    v_add_u32_e32 v59, 0xb0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, s24
 ; GCN-NEXT:    buffer_store_dword v1, v59, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v60, 0xb4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s49
+; GCN-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-NEXT:    buffer_store_dword v1, v60, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v61, 0xb8, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s50
+; GCN-NEXT:    v_mov_b32_e32 v1, s26
 ; GCN-NEXT:    buffer_store_dword v1, v61, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v62, 0xbc, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s51
+; GCN-NEXT:    v_mov_b32_e32 v1, s27
 ; GCN-NEXT:    buffer_store_dword v1, v62, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s12
 ; GCN-NEXT:    v_add_u32_e32 v63, 0xc0, v16
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v1, s36
 ; GCN-NEXT:    buffer_store_dword v1, v63, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-NEXT:    v_add_u32_e32 v64, 0xc4, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, s37
 ; GCN-NEXT:    buffer_store_dword v1, v64, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v1, s14
 ; GCN-NEXT:    v_add_u32_e32 v65, 0xc8, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, s38
 ; GCN-NEXT:    buffer_store_dword v1, v65, s[0:3], 0 offen
-; GCN-NEXT:    s_movk_i32 s12, 0xd0
 ; GCN-NEXT:    v_add_u32_e32 v66, 0xcc, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s15
+; GCN-NEXT:    v_mov_b32_e32 v1, s39
 ; GCN-NEXT:    buffer_store_dword v1, v66, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v67, s12, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s16
+; GCN-NEXT:    v_add_u32_e32 v67, 0xd0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, s40
 ; GCN-NEXT:    buffer_store_dword v1, v67, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v68, 0xd4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NEXT:    v_mov_b32_e32 v1, s41
 ; GCN-NEXT:    buffer_store_dword v1, v68, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v69, 0xd8, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s18
+; GCN-NEXT:    v_mov_b32_e32 v1, s42
 ; GCN-NEXT:    buffer_store_dword v1, v69, s[0:3], 0 offen
-; GCN-NEXT:    s_movk_i32 s13, 0xe0
 ; GCN-NEXT:    v_add_u32_e32 v70, 0xdc, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-NEXT:    v_mov_b32_e32 v1, s43
 ; GCN-NEXT:    buffer_store_dword v1, v70, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v71, s13, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s20
+; GCN-NEXT:    v_add_u32_e32 v71, 0xe0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, s44
 ; GCN-NEXT:    buffer_store_dword v1, v71, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v72, 0xe4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NEXT:    v_mov_b32_e32 v1, s45
 ; GCN-NEXT:    buffer_store_dword v1, v72, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v73, 0xe8, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s22
+; GCN-NEXT:    v_mov_b32_e32 v1, s46
 ; GCN-NEXT:    buffer_store_dword v1, v73, s[0:3], 0 offen
-; GCN-NEXT:    s_movk_i32 s14, 0xf0
 ; GCN-NEXT:    v_add_u32_e32 v74, 0xec, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NEXT:    v_mov_b32_e32 v1, s47
 ; GCN-NEXT:    buffer_store_dword v1, v74, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v75, s14, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s24
+; GCN-NEXT:    v_add_u32_e32 v75, 0xf0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, s48
 ; GCN-NEXT:    buffer_store_dword v1, v75, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v76, 0xf4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s25
-; GCN-NEXT:    s_and_b32 s7, s7, 63
+; GCN-NEXT:    v_mov_b32_e32 v1, s49
+; GCN-NEXT:    s_and_b32 s5, s5, 63
 ; GCN-NEXT:    buffer_store_dword v1, v76, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v77, 0xf8, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s26
+; GCN-NEXT:    v_mov_b32_e32 v1, s50
 ; GCN-NEXT:    v_add_u32_e32 v17, 8, v16
 ; GCN-NEXT:    buffer_store_dword v1, v77, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v78, 0xfc, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s27
-; GCN-NEXT:    s_lshl_b32 s7, s7, 2
+; GCN-NEXT:    v_mov_b32_e32 v1, s51
+; GCN-NEXT:    s_lshl_b32 s5, s5, 2
 ; GCN-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v18, 12, v16
 ; GCN-NEXT:    v_add_u32_e32 v19, 16, v16
@@ -208,8 +199,8 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_add_u32_e32 v29, 56, v16
 ; GCN-NEXT:    v_add_u32_e32 v30, 60, v16
 ; GCN-NEXT:    buffer_store_dword v1, v78, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NEXT:    v_add_u32_e32 v1, s7, v16
+; GCN-NEXT:    v_add_u32_e32 v1, s5, v16
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    buffer_store_dword v3, v18, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v4, v19, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v5, v20, s[0:3], 0 offen
@@ -288,86 +279,24 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    buffer_load_dword v62, v77, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v63, v78, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:256
-; GCN-NEXT:    v_mov_b32_e32 v65, s9
-; GCN-NEXT:    s_add_u32 s6, s8, 16
-; GCN-NEXT:    v_mov_b32_e32 v64, s8
-; GCN-NEXT:    s_addc_u32 s7, s9, 0
+; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v[64:65], v[0:3], off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_add_u32 s6, s8, 32
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
-; GCN-NEXT:    s_addc_u32 s7, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_add_u32 s6, s8, 48
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
-; GCN-NEXT:    s_addc_u32 s7, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_add_u32 s6, s8, 64
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[12:15], off
-; GCN-NEXT:    s_addc_u32 s7, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_add_u32 s6, s8, s4
-; GCN-NEXT:    s_addc_u32 s7, s9, 0
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[16:19], off
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    s_add_u32 s4, s8, s5
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[20:23], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s10
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[24:27], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, 0x80
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[28:31], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s11
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[32:35], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s28
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[36:39], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s29
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[40:43], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, 0xc0
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[44:47], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s12
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[48:51], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s13
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[52:55], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s14
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[56:59], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    global_store_dwordx4 v[0:1], v[60:63], off
+; GCN-NEXT:    global_store_dwordx4 v64, v[0:3], s[8:9]
+; GCN-NEXT:    global_store_dwordx4 v64, v[4:7], s[8:9] offset:16
+; GCN-NEXT:    global_store_dwordx4 v64, v[8:11], s[8:9] offset:32
+; GCN-NEXT:    global_store_dwordx4 v64, v[12:15], s[8:9] offset:48
+; GCN-NEXT:    global_store_dwordx4 v64, v[16:19], s[8:9] offset:64
+; GCN-NEXT:    global_store_dwordx4 v64, v[20:23], s[8:9] offset:80
+; GCN-NEXT:    global_store_dwordx4 v64, v[24:27], s[8:9] offset:96
+; GCN-NEXT:    global_store_dwordx4 v64, v[28:31], s[8:9] offset:112
+; GCN-NEXT:    global_store_dwordx4 v64, v[32:35], s[8:9] offset:128
+; GCN-NEXT:    global_store_dwordx4 v64, v[36:39], s[8:9] offset:144
+; GCN-NEXT:    global_store_dwordx4 v64, v[40:43], s[8:9] offset:160
+; GCN-NEXT:    global_store_dwordx4 v64, v[44:47], s[8:9] offset:176
+; GCN-NEXT:    global_store_dwordx4 v64, v[48:51], s[8:9] offset:192
+; GCN-NEXT:    global_store_dwordx4 v64, v[52:55], s[8:9] offset:208
+; GCN-NEXT:    global_store_dwordx4 v64, v[56:59], s[8:9] offset:224
+; GCN-NEXT:    global_store_dwordx4 v64, v[60:63], s[8:9] offset:240
 ; GCN-NEXT:    s_endpgm
   %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
   %insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index b3cbf7f937be..a29f31fda2cb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -1999,12 +1999,12 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_cselect_b32 s7, s16, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v4, 16
+; GFX9-NEXT:    s_mov_b64 s[0:1], 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_s_v16i16_s_s:
@@ -2163,19 +2163,19 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    v_and_or_b32 v10, v1, s13, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v10, s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v10, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v10, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v10, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v10, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v10, s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v10, s[10:11]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
-; GFX9-NEXT:    v_mov_b32_e32 v11, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i16_s_s:
@@ -2335,10 +2335,9 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 16
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; GFX9-NEXT:    s_mov_b64 s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_store_dwordx4 v0, v[4:7], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_s_v16i16_v_s:
@@ -2510,9 +2509,9 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s22
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s23
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[14:15]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
@@ -2521,10 +2520,9 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 16
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; GFX9-NEXT:    s_mov_b64 s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_store_dwordx4 v0, v[4:7], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_s_v16i16_s_v:
@@ -2699,9 +2697,9 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s18
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s19
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
@@ -2710,10 +2708,9 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 16
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; GFX9-NEXT:    s_mov_b64 s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_store_dwordx4 v0, v[4:7], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_s_v16i16_v_v:
@@ -2874,20 +2871,20 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v9, s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[10:11]
 ; GFX9-NEXT:    v_and_or_b32 v11, v11, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
-; GFX9-NEXT:    v_mov_b32_e32 v10, 16
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0
-; GFX9-NEXT:    v_mov_b32_e32 v11, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i16_s_v:
@@ -3022,20 +3019,20 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[10:11]
 ; GFX9-NEXT:    v_and_or_b32 v11, v1, s13, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
-; GFX9-NEXT:    v_mov_b32_e32 v10, 16
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0
-; GFX9-NEXT:    v_mov_b32_e32 v11, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i16_v_s:
@@ -3177,13 +3174,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v12, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v12, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v12, s[10:11]
-; GFX9-NEXT:    v_mov_b32_e32 v10, 16
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0
-; GFX9-NEXT:    v_mov_b32_e32 v11, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i16_v_v:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 7d99993a6429..fb54cf0da95c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -6,19 +6,18 @@
 define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) {
 ; GFX9-LABEL: insertelement_s_v2i8_s_s:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_load_ushort v1, v1, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
@@ -139,9 +138,8 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
 define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) {
 ; GFX9-LABEL: insertelement_s_v2i8_v_s:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    global_load_ushort v1, v[1:2], off
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_load_ushort v1, v1, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
@@ -205,19 +203,18 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %pt
 define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 %idx) {
 ; GFX9-LABEL: insertelement_s_v2i8_s_v:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    global_load_ushort v1, v[1:2], off
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    global_load_ushort v2, v2, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
@@ -274,9 +271,8 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %pt
 define amdgpu_ps void @insertelement_s_v2i8_v_v(<2 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 %idx) {
 ; GFX9-LABEL: insertelement_s_v2i8_v_v:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    global_load_ushort v2, v2, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
index 18ec87e178e2..96e109e5fe34 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
@@ -690,9 +690,9 @@ body:             |
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]]
-    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
-    ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr
     ; GFX10: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3
     ; GFX10: $vcc_hi = IMPLICIT_DEF
@@ -700,9 +700,9 @@ body:             |
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]]
-    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
-    ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
+    ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s32) = COPY $vgpr3
@@ -791,9 +791,9 @@ body:             |
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]]
-    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 4095, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
-    ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 4095, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+    ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]]
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095
     ; GFX10: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3
     ; GFX10: $vcc_hi = IMPLICIT_DEF

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
index f8eb541beff1..7906ff62c097 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
@@ -21,9 +21,9 @@ body: |
     ; WAVE32-LABEL: name: copy
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3
-    ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]]
     ; WAVE32: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; WAVE32: GLOBAL_STORE_DWORD [[COPY1]], [[DEF]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[DEF]], [[COPY]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %0:sgpr(p1) = COPY $sgpr2_sgpr3
     %1:vgpr(p1) = COPY %0
     %2:vgpr(s32) = G_IMPLICIT_DEF

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
index b450aa8b8196..e04f8d0bf96d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
@@ -26,8 +26,8 @@ body: |
     ; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
     ; CHECK: %12:vreg_64 = nofpexcept V_ADD_F64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: %15:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %12, 0, 0, implicit $mode, implicit $exec
-    ; CHECK: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
-    ; CHECK: GLOBAL_STORE_DWORDX2 [[COPY5]], %15, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %15, [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     ; CHECK: S_ENDPGM 0
     %2:sgpr(p4) = COPY $sgpr0_sgpr1
     %7:sgpr(s64) = G_CONSTANT i64 36
@@ -76,8 +76,8 @@ body: |
     ; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
     ; CHECK: %13:vreg_64 = nofpexcept V_ADD_F64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: %16:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %13, 0, 0, implicit $mode, implicit $exec
-    ; CHECK: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
-    ; CHECK: GLOBAL_STORE_DWORDX2 [[COPY5]], %16, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %16, [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     ; CHECK: S_ENDPGM 0
     %2:sgpr(p4) = COPY $sgpr0_sgpr1
     %7:sgpr(s64) = G_CONSTANT i64 36

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
index 23ba321f9fc8..4623a7fe057c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
@@ -15,16 +15,16 @@ body: |
     ; GFX9-LABEL: name: load_global_s32_from_sgpr
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]]
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
-    ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]]
-    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
-    ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
     %1:vgpr(p1) = COPY %0
     %2:vgpr(s32) = G_LOAD %1 :: (load 4, align 4, addrspace 1)
@@ -400,19 +400,9 @@ body: |
     ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_2049
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2049
-    ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-    ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-    ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
-    ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
-    ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
-    ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2049, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_2049
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10: $vcc_hi = IMPLICIT_DEF
@@ -442,19 +432,9 @@ body: |
     ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294965247
-    ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-    ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-    ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
-    ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
-    ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
-    ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -2049, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10: $vcc_hi = IMPLICIT_DEF
@@ -717,3 +697,52 @@ body: |
     $vgpr0 = COPY %4
 
 ...
+
+---
+name: load_global_s32_from_copy_undef_sgpr
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    ; GFX9-LABEL: name: load_global_s32_from_copy_undef_sgpr
+    ; GFX9: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ; GFX10-LABEL: name: load_global_s32_from_copy_undef_sgpr
+    ; GFX10: $vcc_hi = IMPLICIT_DEF
+    ; GFX10: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    %0:sgpr(p1) = G_IMPLICIT_DEF
+    %1:vgpr(p1) = COPY %0
+    %2:vgpr(s32) = G_LOAD %1 :: (load 4, align 4, addrspace 1)
+    $vgpr0 = COPY %2
+
+...
+
+---
+name: load_global_s32_from_undef_vgpr
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    ; GFX9-LABEL: name: load_global_s32_from_undef_vgpr
+    ; GFX9: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ; GFX10-LABEL: name: load_global_s32_from_undef_vgpr
+    ; GFX10: $vcc_hi = IMPLICIT_DEF
+    ; GFX10: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    %0:vgpr(p1) = G_IMPLICIT_DEF
+    %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 1)
+    $vgpr0 = COPY %1
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
index e2fba855f9db..5fc598b3dcbf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -55,11 +55,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 ad
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    ds_inc_rtn_u32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    ds_inc_rtn_u32 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false), !noalias !0
   store i32 %result, i32 addrspace(1)* %out
@@ -106,11 +105,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    ds_inc_rtn_u32 v2, v1, v0 offset:16
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    ds_inc_rtn_u32 v0, v1, v0 offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
@@ -217,15 +215,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32
 ; GFX9-LABEL: global_atomic_inc_ret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_inc v2, v[0:1], v2, off glc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_atomic_inc v0, v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out
@@ -268,17 +263,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %o
 ; GFX9-LABEL: global_atomic_inc_ret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s2, s2, 16
-; GFX9-NEXT:    s_addc_u32 s3, s3, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_inc v2, v[0:1], v2, off glc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
@@ -310,11 +300,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) n
 ; GFX9-LABEL: global_atomic_inc_noret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_atomic_inc v0, v[0:1], v2, off glc
+; GFX9-NEXT:    global_atomic_inc v0, v1, v0, s[0:1] glc
 ; GFX9-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
   ret void
@@ -348,13 +337,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)*
 ; GFX9-LABEL: global_atomic_inc_noret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s0, 16
-; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_atomic_inc v0, v[0:1], v2, off glc
+; GFX9-NEXT:    global_atomic_inc v0, v1, v0, s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
@@ -508,18 +494,15 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out,
 ;
 ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 9
-; GFX9-NEXT:    ds_inc_rtn_u32 v3, v0, v1 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v2, 9
+; GFX9-NEXT:    ds_inc_rtn_u32 v0, v0, v2 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v3, off
+; GFX9-NEXT:    global_store_dword v2, v1, s[2:3]
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -572,10 +555,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 ad
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out
@@ -624,10 +606,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
@@ -742,16 +723,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64
 ; GFX9-LABEL: global_atomic_inc_ret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out
@@ -796,18 +774,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o
 ; GFX9-LABEL: global_atomic_inc_ret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s2, s2, 32
-; GFX9-NEXT:    s_addc_u32 s3, s3, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
@@ -841,12 +814,11 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) n
 ; GFX9-LABEL: global_atomic_inc_noret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
   ret void
@@ -882,14 +854,11 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)*
 ; GFX9-LABEL: global_atomic_inc_noret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s0, 32
-; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
@@ -1247,19 +1216,16 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out,
 ;
 ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_add_u32_e32 v4, 2, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:16
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v3, v[0:1] offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    global_store_dword v[2:3], v4, off
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_store_dword v3, v2, s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -1523,17 +1489,14 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0,
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    ds_inc_rtn_u32 v4, v1, v0
-; GFX9-NEXT:    ds_inc_rtn_u32 v5, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    ds_inc_rtn_u32 v2, v1, v0
+; GFX9-NEXT:    ds_inc_rtn_u32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX9-NEXT:    global_store_dword v[0:1], v4, off
+; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v[2:3], v5, off
+; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
   %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 8847cb053f70..3cb754590898 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -267,10 +267,9 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32]
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT:    v_div_fmas_f32 v2, s5, v0, v1
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s5, v0, v1
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32:
@@ -286,10 +285,9 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32]
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10_W64-NEXT:    v_div_fmas_f32 v2, s5, v0, v1
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s5, v0, v1
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
   store float %result, float addrspace(1)* %out, align 4
@@ -340,15 +338,14 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %o
 ; GFX10_W32-NEXT:    s_load_dword s3, s[0:1], 0x94
 ; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x70
 ; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT:    v_div_fmas_f32 v2, 1.0, s4, v0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT:    v_div_fmas_f32 v0, 1.0, s4, v0
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0:
@@ -358,14 +355,13 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %o
 ; GFX10_W64-NEXT:    s_load_dword s3, s[0:1], 0x94
 ; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x70
 ; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX10_W64-NEXT:    v_div_fmas_f32 v2, 1.0, s4, v0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT:    v_div_fmas_f32 v0, 1.0, s4, v0
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d)
   store float %result, float addrspace(1)* %out, align 4
@@ -416,15 +412,14 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %o
 ; GFX10_W32-NEXT:    s_load_dword s3, s[0:1], 0x34
 ; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT:    v_div_fmas_f32 v2, s4, 1.0, v0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s4, 1.0, v0
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1:
@@ -434,14 +429,13 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %o
 ; GFX10_W64-NEXT:    s_load_dword s3, s[0:1], 0x34
 ; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX10_W64-NEXT:    v_div_fmas_f32 v2, s4, 1.0, v0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s4, 1.0, v0
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d)
   store float %result, float addrspace(1)* %out, align 4
@@ -492,15 +486,14 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %o
 ; GFX10_W32-NEXT:    s_load_dword s3, s[0:1], 0x70
 ; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x4c
 ; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT:    v_div_fmas_f32 v2, s4, v0, 1.0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s4, v0, 1.0
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2:
@@ -510,14 +503,13 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %o
 ; GFX10_W64-NEXT:    s_load_dword s3, s[0:1], 0x70
 ; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x4c
 ; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX10_W64-NEXT:    v_div_fmas_f32 v2, s4, v0, 1.0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s4, v0, 1.0
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d)
   store float %result, float addrspace(1)* %out, align 4
@@ -579,9 +571,8 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s8
 ; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3]
-; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10_W32-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10_W32-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10_W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f64:
@@ -597,9 +588,8 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3]
-; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10_W64-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10_W64-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10_W64-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10_W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
   store double %result, double addrspace(1)* %out, align 8
@@ -658,10 +648,9 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %ou
 ; GFX10_W32-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT:    v_div_fmas_f32 v2, s4, v0, v1
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc:
@@ -676,10 +665,9 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %ou
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX10_W64-NEXT:    v_div_fmas_f32 v2, s4, v0, v1
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
   %cmp = icmp eq i32 %i, 0
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp)
@@ -734,10 +722,9 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10_W32-NEXT:    v_div_fmas_f32 v2, s4, v0, v1
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
@@ -751,10 +738,9 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10_W64-NEXT:    v_div_fmas_f32 v2, s4, v0, v1
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false)
   store float %result, float addrspace(1)* %out, align 4
@@ -808,10 +794,9 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10_W32-NEXT:    v_div_fmas_f32 v2, s4, v0, v1
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
@@ -825,10 +810,9 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10_W64-NEXT:    v_div_fmas_f32 v2, s4, v0, v1
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true)
   store float %result, float addrspace(1)* %out, align 4
@@ -899,7 +883,7 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W32-NEXT:    s_load_dword s2, s[0:1], 0x54
+; GFX10_W32-NEXT:    s_load_dword s0, s[0:1], 0x54
 ; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
@@ -907,43 +891,37 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
 ; GFX10_W32-NEXT:    global_load_dword v2, v1, s[6:7]
 ; GFX10_W32-NEXT:    global_load_dword v3, v1, s[6:7] offset:4
 ; GFX10_W32-NEXT:    global_load_dword v1, v1, s[6:7] offset:8
-; GFX10_W32-NEXT:    s_add_u32 s0, s4, 8
-; GFX10_W32-NEXT:    s_addc_u32 s1, s5, 0
-; GFX10_W32-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10_W32-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 s2, 0, s2
-; GFX10_W32-NEXT:    s_and_b32 vcc_lo, vcc_lo, s2
+; GFX10_W32-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10_W32-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10_W32-NEXT:    s_and_b32 s0, 1, s0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10_W32-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W32-NEXT:    v_div_fmas_f32 v2, v2, v3, v1
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v2, v3, v1
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[4:5] offset:8
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W64-NEXT:    s_load_dword s2, s[0:1], 0x54
+; GFX10_W64-NEXT:    s_load_dword s0, s[0:1], 0x54
 ; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    s_clause 0x2
 ; GFX10_W64-NEXT:    global_load_dword v2, v1, s[6:7]
 ; GFX10_W64-NEXT:    global_load_dword v3, v1, s[6:7] offset:4
 ; GFX10_W64-NEXT:    global_load_dword v1, v1, s[6:7] offset:8
-; GFX10_W64-NEXT:    s_add_u32 s0, s4, 8
-; GFX10_W64-NEXT:    s_addc_u32 s1, s5, 0
-; GFX10_W64-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10_W64-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, s2
-; GFX10_W64-NEXT:    s_and_b64 vcc, vcc, s[2:3]
+; GFX10_W64-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10_W64-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10_W64-NEXT:    s_and_b32 s0, 1, s0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX10_W64-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W64-NEXT:    v_div_fmas_f32 v2, v2, v3, v1
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v2, v3, v1
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[4:5] offset:8
 ; GFX10_W64-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -1058,14 +1036,11 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX10_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_add_u32 s0, s2, 8
-; GFX10_W32-NEXT:    s_addc_u32 s1, s3, 0
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W32-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3] offset:8
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
@@ -1091,14 +1066,11 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX10_W64-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX10_W64-NEXT:    s_and_b32 s0, 1, s6
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_add_u32 s0, s2, 8
-; GFX10_W64-NEXT:    s_addc_u32 s1, s3, 0
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W64-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3] offset:8
 ; GFX10_W64-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index 69d52364bf3b..9c0cb28c1227 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -53,10 +53,9 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float
 ; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, v0, v0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -120,10 +119,9 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float
 ; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, v1, v0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, v1, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -193,9 +191,8 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v2, s[2:3] offset:8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -265,9 +262,8 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v2, s[2:3] offset:8
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -324,14 +320,13 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x54
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s0, v0, v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, v0, v0, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -386,14 +381,13 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s0, s0, v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, s0, v0, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -448,14 +442,13 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s0, s0, s0, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, s0, s0, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -510,14 +503,13 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s0, v0, s0, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s0, v0, s0, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -572,14 +564,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)*
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1]
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -634,14 +625,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)*
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1]
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -696,14 +686,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)*
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1]
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -758,14 +747,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)*
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1]
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -811,12 +799,11 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %
 ; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x4c
 ; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x70
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, s3, s3, s2
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, s3, s3, s2
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
   %result0 = extractvalue { float, i1 } %result, 0
@@ -857,12 +844,11 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %
 ; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x4c
 ; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x70
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, s2, s3, s2
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, s2, s3, s2
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
   %result0 = extractvalue { float, i1 } %result, 0
@@ -905,12 +891,11 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)*
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
   %result0 = extractvalue { double, i1 } %result, 0
@@ -953,12 +938,11 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)*
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
   %result0 = extractvalue { double, i1 } %result, 0
@@ -1005,14 +989,13 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)*
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, v0, v0, 1.0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, 1.0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -1063,14 +1046,13 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)*
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, 2.0, 2.0, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, 2.0, 2.0, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -1137,10 +1119,9 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out,
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, v0, v0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -1209,10 +1190,9 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
 ; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, v0, v0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -1255,12 +1235,11 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)*
 ; GFX10-LABEL: test_div_scale_f32_val_undef_val:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, s0, s0, 0x41000000
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, s0, s0, 0x41000000
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false)
   %result0 = extractvalue { float, i1 } %result, 0
@@ -1295,12 +1274,11 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)*
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41000000
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, v0, v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)
   %result0 = extractvalue { float, i1 } %result, 0
@@ -1332,12 +1310,11 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)
 ; GFX10-LABEL: test_div_scale_f32_undef_undef_val:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_div_scale_f32 v2, s2, s0, s0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, s0, s0, s0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false)
   %result0 = extractvalue { float, i1 } %result, 0
@@ -1372,15 +1349,14 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)*
 ;
 ; GFX10-LABEL: test_div_scale_f64_val_undef_val:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    s_mov_b32 s2, 0
 ; GFX10-NEXT:    s_mov_b32 s3, 0x40200000
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
   %result0 = extractvalue { double, i1 } %result, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
index 867cf8d91159..956432f39472 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
@@ -66,12 +66,11 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v5, s10
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe
-; GFX10-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off
+; GFX10-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
@@ -112,12 +111,11 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v5, s10
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe lwe
-; GFX10-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off
+; GFX10-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 3, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
index 22c12553cffe..257615e71fea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
@@ -62,10 +62,9 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v4, v1
 ; GFX9-NEXT:    image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe da
-; GFX9-NEXT:    v_mov_b32_e32 v5, s10
-; GFX9-NEXT:    v_mov_b32_e32 v6, s11
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[5:6], v4, off
+; GFX9-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -84,12 +83,11 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v5, s10
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe
-; GFX10-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off
+; GFX10-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
@@ -116,10 +114,9 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v4, v1
 ; GFX9-NEXT:    image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe lwe da
-; GFX9-NEXT:    v_mov_b32_e32 v5, s10
-; GFX9-NEXT:    v_mov_b32_e32 v6, s11
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[5:6], v4, off
+; GFX9-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -138,12 +135,11 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v5, s10
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe lwe
-; GFX10-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off
+; GFX10-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 3, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
index 33c95e3e3309..dd7111bbb955 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
@@ -66,12 +66,11 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v5, s10
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    image_load v[0:4], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe
-; GFX10-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off
+; GFX10-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
@@ -112,12 +111,11 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v5, s10
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    image_load v[0:4], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe lwe
-; GFX10-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off
+; GFX10-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
index 02129757f825..011d76710606 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
@@ -62,10 +62,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v3, s8
 ; GFX9-NEXT:    image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe
-; GFX9-NEXT:    v_mov_b32_e32 v5, s10
-; GFX9-NEXT:    v_mov_b32_e32 v6, s11
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[5:6], v4, off
+; GFX9-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -84,12 +83,11 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v5, s10
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
-; GFX10-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off
+; GFX10-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 1, i32 0)
@@ -116,10 +114,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v3, s8
 ; GFX9-NEXT:    image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe lwe
-; GFX9-NEXT:    v_mov_b32_e32 v5, s10
-; GFX9-NEXT:    v_mov_b32_e32 v6, s11
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[5:6], v4, off
+; GFX9-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -138,12 +135,11 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v5, s10
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
-; GFX10-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off
+; GFX10-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
index e23ba4fde3fa..bfd347086b7d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
@@ -66,12 +66,11 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v5, s10
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    image_load v[0:4], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe
-; GFX10-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off
+; GFX10-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0)
@@ -112,12 +111,11 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v5, s10
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    image_load v[0:4], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe
-; GFX10-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off
+; GFX10-NEXT:    global_store_dword v5, v4, s[10:11]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
index 6c0fe47d5ae3..235f6066eb6b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
@@ -23,13 +23,12 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
 ; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa]
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa]
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2 ; encoding: [0x02,0x02,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1 ; encoding: [0x01,0x02,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x04,0x7e,0x02,0x01,0x08,0x11]
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x00,0x02,0x7d,0x00]
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x00,0x00]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0
   store i32 %tmp0, i32 addrspace(1)* %out
@@ -52,15 +51,14 @@ define amdgpu_kernel void @mov_dpp64_test(i64 addrspace(1)* %out, i64 %in1) {
 ; GFX10-LABEL: mov_dpp64_test:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa]
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1 ; encoding: [0x01,0x02,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0 ; encoding: [0x00,0x02,0x04,0x7e]
 ; GFX10-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
 ; GFX10-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x7d,0x00]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
   %tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 false) #0
   store i64 %tmp0, i64 addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 812ad9781f8f..4badf1ddadd4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -24,12 +24,11 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    v_mov_b32_e32 v0, s3
-; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false)
   store i32 %tmp0, i32 addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 8046caf973be..291f40e4f22a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -387,9 +387,8 @@ define <3 x i32> @v_load_constant_v3i32_align16(<3 x i32> addrspace(4)* %ptr) {
 define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)* inreg %ptr) {
 ; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
 ; GFX9-UNALIGNED:       ; %bb.0:
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
+; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v0, s[0:1]
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
@@ -398,99 +397,55 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
 ;
 ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
 ; GFX9-NOUNALIGNED:       ; %bb.0:
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 1
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 2
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 3
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v7, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v6, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 4
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v9, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v8, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 5
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v11, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v10, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 6
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v13, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v12, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 7
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v15, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v14, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 8
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v16, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v17, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v18, v[4:5], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v19, v[6:7], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v8, v[8:9], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v9, v[10:11], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v[12:13], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v[14:15], off
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 9
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 10
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s0, s0, 11
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v7, s1
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v6, s0
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v12, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v2, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v3, v[4:5], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v[6:7], off
+; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:1
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:2
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:3
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:4
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:5
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:6
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:7
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:8
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:9
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:10
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:11
+; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v12, 0xff
 ; GFX9-NOUNALIGNED-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s1, 8
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v5, 0xff
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v6, 8
+; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v13, 8
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v0, s1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s0, v18
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v3, s0, v3
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v7, s0, v19
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v16, s0, v0
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v0, v1, v7
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v1, v1, s0, v2
+; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v6, s1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, v10, v5
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v7, v7, v12
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v7, v11, v5
-; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v0, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v8, v5, v0
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v0, v1, v7
-; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v8, v8, v12
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, v3, v5
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v11, v11, v12
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, v4, v5
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v12, v5, v0
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v0, v1, v2
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, v0, v12
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v4, v5, s0, v6
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v7, v9, v12, v10
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v9, 24, v0
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v1, v2, v3
+; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v4, v5, v6
+; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v7, v8, v9
+; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX9-NOUNALIGNED-NEXT:    ; return to shader part epilog
 ;
@@ -577,9 +532,8 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
 define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)* inreg %ptr) {
 ; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
 ; GFX9-UNALIGNED:       ; %bb.0:
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
+; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v0, s[0:1]
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
@@ -588,49 +542,27 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
 ;
 ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
 ; GFX9-NOUNALIGNED:       ; %bb.0:
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 2
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 4
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 6
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v7, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v6, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 8
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s0, s0, 10
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v9, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v11, s1
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v10, s0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v8, s2
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v1, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v2, v[4:5], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v3, v[6:7], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v4, v[8:9], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v5, v[10:11], off
+; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2
+; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v3, v0, s[0:1] offset:4
+; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v4, v0, s[0:1] offset:6
+; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v5, v0, s[0:1] offset:8
+; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v0, v0, s[0:1] offset:10
 ; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v6, 0xffff
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v0, s0, v1
-; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, v3, v6
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v1, v2, v6, v0
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, v5, v6
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v2, v4, v6, v0
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v1, s0, v2
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v1, v3, s0, v4
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v2, v5, s0, v6
+; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX9-NOUNALIGNED-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 5f4d4097b23a..3a4e59075070 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -91,24 +91,20 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    s_cbranch_scc0 BB1_2
 ; GFX9-NEXT:  ; %bb.1: ; %bb1
+; GFX9-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s0, gv2@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s1, s1, gv2@gotpcrel32@hi+12
 ; GFX9-NEXT:    s_getpc_b64 s[2:3]
-; GFX9-NEXT:    s_add_u32 s2, s2, gv2@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s3, s3, gv2@gotpcrel32@hi+12
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, gv3@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, gv3@gotpcrel32@hi+12
+; GFX9-NEXT:    s_add_u32 s2, s2, gv3@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, gv3@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-NEXT:    s_mov_b32 s0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v2, 1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    global_store_dword v0, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:  BB1_2: ; %Flow
 ; GFX9-NEXT:    s_xor_b32 s0, s0, -1
 ; GFX9-NEXT:    s_and_b32 s0, s0, 1
@@ -123,15 +119,11 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
 ; GFX9-NEXT:    s_addc_u32 s3, s3, gv1@gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dword v[0:1], v3, off
+; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:  BB1_4: ; %bb2
 ; GFX9-NEXT:    s_endpgm
 entry:
@@ -171,17 +163,13 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, static.gv2@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, static.gv2@rel32@hi+12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_store_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, static.gv3@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, static.gv3@rel32@hi+12
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, 1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v1, 1
+; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:  BB2_2: ; %Flow
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -190,17 +178,13 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, static.gv0@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, static.gv0@rel32@hi+12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_store_dword v0, v0, s[6:7]
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
 ; GFX9-NEXT:    s_add_u32 s6, s6, static.gv1@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, static.gv1@rel32@hi+12
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, 1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v1, 1
+; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:  BB2_4: ; %bb2
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index 3fa2613a4505..16c2b6247210 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -38,23 +38,21 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; GCN-NEXT:    s_add_u32 s5, s32, 0x1000
+; GCN-NEXT:    s_add_u32 s8, s5, 4
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NEXT:    s_add_u32 s8, s5, 4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s4, s4, 2
 ; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v1, 1
-; GCN-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NEXT:    v_mov_b32_e32 v2, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s8
 ; GCN-NEXT:    s_add_u32 s4, s5, s4
-; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v2, v1, v0
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    global_store_dword v[0:1], v2, off
+; GCN-NEXT:    v_add_u32_e32 v0, v2, v0
+; GCN-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GCN-NEXT:  BB0_3: ; %bb.2
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
@@ -119,17 +117,15 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_lshl_b32 s4, s4, 2
 ; GCN-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v1, 1
-; GCN-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NEXT:    v_mov_b32_e32 v2, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s8
 ; GCN-NEXT:    s_add_u32 s4, s5, s4
-; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v2, v1, v0
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    global_store_dword v[0:1], v2, off
+; GCN-NEXT:    v_add_u32_e32 v0, v2, v0
+; GCN-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GCN-NEXT:  BB1_2: ; %bb.1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off

diff  --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 8fe55b673804..abb602af7667 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -133,9 +133,12 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)*
 
 ; HSA-LABEl: {{^}}use_constant_to_global_addrspacecast:
 ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
-; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
-; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
-; HSA: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
+; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; CI: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
+
+; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_load_dword v{{[0-9]+}}, [[ZERO:v[0-9]+]], s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}
 define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
   %ld = load volatile i32, i32 addrspace(1)* %stof
@@ -186,10 +189,13 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
 ; HSA: enable_sgpr_queue_ptr = 0
 
 ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
-; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
-; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
-; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
-; HSA: {{flat|global}}_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
+; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
+; CI: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
+
+; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; GFX9: global_store_dword [[ZERO]], [[ZERO]], s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]$}}
 define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 {
   %ftos = addrspacecast i32* %ptr to i32 addrspace(1)*
   store volatile i32 0, i32 addrspace(1)* %ftos

diff  --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
index 4222ec5eed71..40255134aa57 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
@@ -22,9 +22,9 @@
 ; ELF: }
 
 ; GFX10-W32: NumSGPRsForWavesPerEU: 4
-; GFX10-W32: NumVGPRsForWavesPerEU: 3
+; GFX10-W32: NumVGPRsForWavesPerEU: 1
 ; GFX10-W64: NumSGPRsForWavesPerEU: 2
-; GFX10-W64: NumVGPRsForWavesPerEU: 3
+; GFX10-W64: NumVGPRsForWavesPerEU: 1
 
 define amdgpu_kernel void @simple(i32 addrspace(1)* %out) {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 297f94429712..808230581f5a 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -32,12 +32,10 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
 ; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GCN-NEXT:    s_add_u32 s0, s0, s9
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    global_store_dword v[0:1], v2, off
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_store_dword v0, v0, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_getpc_b64 s[6:7]
 ; GCN-NEXT:    s_add_u32 s6, s6, func@rel32@lo+4
@@ -66,9 +64,7 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32)
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    v_mov_b32_e32 v40, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v0, s34
-; GCN-NEXT:    v_mov_b32_e32 v1, s35
-; GCN-NEXT:    global_store_dword v[0:1], v40, off
+; GCN-NEXT:    global_store_dword v40, v40, s[34:35]
 ; GCN-NEXT:    s_endpgm
   call void @func(i32 0)
   store i32 0, i32 addrspace(1)* %ptr
@@ -88,10 +84,9 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)*
 ; GCN-NEXT:    s_add_u32 s4, s4, func.return@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, func.return@rel32@hi+12
 ; GCN-NEXT:    s_mov_b32 s32, 0
+; GCN-NEXT:    v_mov_b32_e32 v40, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v1, s34
-; GCN-NEXT:    v_mov_b32_e32 v2, s35
-; GCN-NEXT:    global_store_dword v[1:2], v0, off
+; GCN-NEXT:    global_store_dword v40, v0, s[34:35]
 ; GCN-NEXT:    s_endpgm
   %rv = call i32 @func.return(i32 0)
   store i32 %rv, i32 addrspace(1)* %ptr

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index e0e763623481..4538436cbe5c 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -161,7 +161,8 @@ define hidden void @func_indirect_use_workgroup_id_z() #1 {
 }
 
 ; GCN-LABEL: {{^}}other_arg_use_workgroup_id_x:
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; CIVI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0, off
 ; GCN: ; use s12
 define hidden void @other_arg_use_workgroup_id_x(i32 %arg0) #1 {
   %val = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -171,7 +172,8 @@ define hidden void @other_arg_use_workgroup_id_x(i32 %arg0) #1 {
 }
 
 ; GCN-LABEL: {{^}}other_arg_use_workgroup_id_y:
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; CIVI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0, off
 ; GCN: ; use s13
 define hidden void @other_arg_use_workgroup_id_y(i32 %arg0) #1 {
   %val = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -181,7 +183,8 @@ define hidden void @other_arg_use_workgroup_id_y(i32 %arg0) #1 {
 }
 
 ; GCN-LABEL: {{^}}other_arg_use_workgroup_id_z:
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; CIVI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0, off
 ; GCN: ; use s14
 define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
   %val = call i32 @llvm.amdgcn.workgroup.id.z()

diff  --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
index 1d79f88c0094..450bd02eba7e 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
@@ -33,11 +33,10 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 add
 ; GCN-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GCN-NEXT:    s_cbranch_execz BB0_2
 ; GCN-NEXT:  ; %bb.1: ; %if
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    v_mov_b32_e32 v2, 2
-; GCN-NEXT:    global_atomic_csub v0, v[0:1], v2, off offset:28 glc
+; GCN-NEXT:    global_atomic_csub v0, v0, v1, s[2:3] offset:28 glc
 ; GCN-NEXT:  BB0_2: ; %endif
 ; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x3d0800

diff  --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index 22dde27d0038..be1e841a09c7 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -32,11 +32,10 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float a
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_cbranch_execz BB0_2
 ; GCN-NEXT:  ; %bb.1: ; %if
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
-; GCN-NEXT:    global_atomic_add_f32 v[0:1], v2, off offset:28
+; GCN-NEXT:    global_atomic_add_f32 v0, v1, s[2:3] offset:28
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off
 ; GCN-NEXT:  BB0_2: ; %endif
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]

diff  --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index c803b26a969a..8f2ce7391080 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -74,7 +74,8 @@ done:
 ; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
 ; GCN: s_and_saveexec_b64
 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
-; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
+; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, [[ZERO]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
 ; GCN: {{^}}BB2_2:
 ; GCN: s_or_b64 exec
 define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
@@ -697,7 +698,8 @@ done:
 ; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr
 
 ; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
-; GFX9: global_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:-4096{{$}}
+; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_load_sbyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:-4096{{$}}
 define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024

diff  --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 1289032955fb..271f6c703980 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -226,32 +226,29 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
 ; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
 ; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX900-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-NEXT:    s_add_u32 s0, s0, s9
 ; GFX900-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    v_mov_b32_e32 v0, s4
-; GFX900-NEXT:    v_mov_b32_e32 v1, s5
-; GFX900-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX900-NEXT:    global_load_ushort v0, v2, s[4:5]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
-; GFX900-NEXT:    global_load_ushort v2, v[0:1], off offset:2
+; GFX900-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT:    global_load_ushort v0, v2, s[4:5] offset:2
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:6
-; GFX900-NEXT:    global_load_ushort v2, v[0:1], off offset:4
-; GFX900-NEXT:    v_mov_b32_e32 v0, s6
-; GFX900-NEXT:    v_mov_b32_e32 v1, s7
+; GFX900-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:6
+; GFX900-NEXT:    global_load_ushort v0, v2, s[4:5] offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:8
-; GFX900-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:4
-; GFX900-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:6
+; GFX900-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:8
+; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 offset:6
 ; GFX900-NEXT:    s_waitcnt vmcnt(1)
-; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_mov_b32_e32 v3, v4
-; GFX900-NEXT:    buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8
-; GFX900-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
+; GFX900-NEXT:    v_mov_b32_e32 v1, v3
+; GFX900-NEXT:    buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
+; GFX900-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX900-NEXT:    s_endpgm
 ;
 ; FLATSCR-LABEL: vload2_private:
@@ -259,36 +256,33 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; FLATSCR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
 ; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
 ; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
-; FLATSCR-NEXT:    v_mov_b32_e32 v1, s1
-; FLATSCR-NEXT:    global_load_ushort v2, v[0:1], off
+; FLATSCR-NEXT:    global_load_ushort v0, v2, s[0:1]
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_short off, v2, vcc_hi offset:4
-; FLATSCR-NEXT:    global_load_ushort v2, v[0:1], off offset:2
+; FLATSCR-NEXT:    scratch_store_short off, v0, vcc_hi offset:4
+; FLATSCR-NEXT:    global_load_ushort v0, v2, s[0:1] offset:2
 ; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_short off, v2, vcc_hi offset:6
-; FLATSCR-NEXT:    global_load_ushort v2, v[0:1], off offset:4
+; FLATSCR-NEXT:    scratch_store_short off, v0, vcc_hi offset:6
+; FLATSCR-NEXT:    global_load_ushort v0, v2, s[0:1] offset:4
 ; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
-; FLATSCR-NEXT:    v_mov_b32_e32 v0, s2
-; FLATSCR-NEXT:    v_mov_b32_e32 v1, s3
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_short off, v2, vcc_hi offset:8
+; FLATSCR-NEXT:    scratch_store_short off, v0, vcc_hi offset:8
 ; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
-; FLATSCR-NEXT:    scratch_load_ushort v2, off, vcc_hi offset:4
+; FLATSCR-NEXT:    scratch_load_ushort v0, off, vcc_hi offset:4
 ; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
-; FLATSCR-NEXT:    scratch_load_ushort v4, off, vcc_hi offset:6
+; FLATSCR-NEXT:    scratch_load_ushort v3, off, vcc_hi offset:6
 ; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; FLATSCR-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; FLATSCR-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    v_mov_b32_e32 v3, v4
-; FLATSCR-NEXT:    scratch_load_short_d16_hi v3, off, vcc_hi offset:8
-; FLATSCR-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
+; FLATSCR-NEXT:    v_mov_b32_e32 v1, v3
+; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, off, vcc_hi offset:8
+; FLATSCR-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; FLATSCR-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; FLATSCR-NEXT:    s_endpgm
 entry:
   %loc = alloca [3 x i16], align 2, addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
index e9233632c9da..d10d0dd74741 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
@@ -4,7 +4,7 @@
 ; GCN:     s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
 ; GCN:     v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
 ; GCN-NOT: v_and_b32
-; GCN:     store_dword v[{{[0-9:]+}}], [[VSEL]],
+; GCN:     store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @select_and1(i32 addrspace(1)* %p, i32 %x, i32 %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, i32 0, i32 -1
@@ -17,7 +17,7 @@ define amdgpu_kernel void @select_and1(i32 addrspace(1)* %p, i32 %x, i32 %y) {
 ; GCN:     s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
 ; GCN:     v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
 ; GCN-NOT: v_and_b32
-; GCN:     store_dword v[{{[0-9:]+}}], [[VSEL]],
+; GCN:     store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @select_and2(i32 addrspace(1)* %p, i32 %x, i32 %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, i32 0, i32 -1
@@ -30,7 +30,7 @@ define amdgpu_kernel void @select_and2(i32 addrspace(1)* %p, i32 %x, i32 %y) {
 ; GCN:     s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
 ; GCN:     v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
 ; GCN-NOT: v_and_b32
-; GCN:     store_dword v[{{[0-9:]+}}], [[VSEL]],
+; GCN:     store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @select_and3(i32 addrspace(1)* %p, i32 %x, i32 %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, i32 -1, i32 0
@@ -58,7 +58,7 @@ define amdgpu_kernel void @select_and_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4
 ; GCN:     s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
 ; GCN:     v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
 ; GCN-NOT: v_or_b32
-; GCN:     store_dword v[{{[0-9:]+}}], [[VSEL]],
+; GCN:     store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @select_or1(i32 addrspace(1)* %p, i32 %x, i32 %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, i32 0, i32 -1
@@ -71,7 +71,7 @@ define amdgpu_kernel void @select_or1(i32 addrspace(1)* %p, i32 %x, i32 %y) {
 ; GCN:     s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
 ; GCN:     v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
 ; GCN-NOT: v_or_b32
-; GCN:     store_dword v[{{[0-9:]+}}], [[VSEL]],
+; GCN:     store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @select_or2(i32 addrspace(1)* %p, i32 %x, i32 %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, i32 0, i32 -1
@@ -84,7 +84,7 @@ define amdgpu_kernel void @select_or2(i32 addrspace(1)* %p, i32 %x, i32 %y) {
 ; GCN:     s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
 ; GCN:     v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
 ; GCN-NOT: v_or_b32
-; GCN:     store_dword v[{{[0-9:]+}}], [[VSEL]],
+; GCN:     store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @select_or3(i32 addrspace(1)* %p, i32 %x, i32 %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, i32 -1, i32 0
@@ -118,7 +118,7 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(i32 addrspac
 }
 
 ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16:
-; GCN: v_cndmask_b32_e64 v2, 2, 9,
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 9,
 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(i16 addrspace(1)* %p, i1 %cond) {
   %sel = select i1 %cond, i16 -4, i16 3
   %bo = sub i16 5, %sel

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 9f72a34b726a..f9a3d1902b53 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -958,14 +958,12 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out
 ;
 ; GFX9-LABEL: load_constant_adjacent_offsets:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -990,14 +988,12 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out
 ;
 ; GFX9-LABEL: load_constant_disjoint_offsets:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:2
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
@@ -1026,29 +1022,25 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
 ;
 ; GFX9-ALIGNED-LABEL: load_misaligned64_constant_offsets:
 ; GFX9-ALIGNED:       ; %bb.0:
-; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-ALIGNED-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
-; GFX9-ALIGNED-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
+; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-ALIGNED-NEXT:    ds_read2_b32 v[0:1], v4 offset1:1
+; GFX9-ALIGNED-NEXT:    ds_read2_b32 v[2:3], v4 offset0:2 offset1:3
 ; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-ALIGNED-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-ALIGNED-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-ALIGNED-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-ALIGNED-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-ALIGNED-NEXT:    s_endpgm
 ;
 ; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets:
 ; GFX9-UNALIGNED:       ; %bb.0:
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-UNALIGNED-NEXT:    ds_read_b128 v[0:3], v0
+; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-UNALIGNED-NEXT:    ds_read_b128 v[0:3], v4
 ; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-UNALIGNED-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-UNALIGNED-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-UNALIGNED-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
@@ -1083,12 +1075,11 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspac
 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset1:1
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
@@ -1154,10 +1145,9 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v6
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v7
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX9-NEXT:    v_add_f32_e32 v2, v0, v9
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v9
+; GFX9-NEXT:    global_store_dword v10, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
   %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
@@ -1221,13 +1211,12 @@ define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
   store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
@@ -1253,13 +1242,12 @@ define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addr
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %load = load i64, i64 addrspace(3)* %in, align 4
   store i64 %load, i64 addrspace(1)* %out, align 8
@@ -1304,6 +1292,7 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving(
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v2, s0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v3, s1, v0
@@ -1322,10 +1311,8 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving(
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v7
-; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off offset:40
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX9-NEXT:    global_store_dword v8, v0, s[4:5] offset:40
 ; GFX9-NEXT:    s_endpgm
   float addrspace(1)* nocapture %arg,
   [4 x [4 x float]] addrspace(3)* %arg1,
@@ -1402,27 +1389,26 @@ define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspa
 ; GFX9-NEXT:    s_mov_b32 s36, s0
 ; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[36:37], 0x0
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
 ; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_add_u32 s36, s36, s3
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    v_lshl_add_u32 v41, v0, 2, s0
 ; GFX9-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12
-; GFX9-NEXT:    v_lshl_add_u32 v40, v0, 2, s2
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    ds_read_b32 v41, v40
+; GFX9-NEXT:    ds_read_b32 v42, v41
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    ds_read_b32 v0, v40 offset:4
+; GFX9-NEXT:    ds_read_b32 v0, v41 offset:4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v2, v41, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s34
-; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_add_u32_e32 v0, v42, v0
+; GFX9-NEXT:    global_store_dword v40, v0, s[34:35]
 ; GFX9-NEXT:    s_endpgm
   %x = call i32 @llvm.amdgcn.workitem.id.x()
   %arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x
@@ -1508,41 +1494,38 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)*
 ;
 ; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset:
 ; GFX9-ALIGNED:       ; %bb.0: ; %entry
-; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v2, v0 offset:65
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v3, v0 offset:66
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v4, v0 offset:67
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v5, v0 offset:68
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v1, v0 offset:69
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v6, v0 offset:70
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v7, v0 offset:71
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v0, v0 offset:72
+; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v0, v2 offset:65
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v3, v2 offset:66
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v4, v2 offset:67
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v5, v2 offset:68
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v1, v2 offset:69
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v6, v2 offset:70
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v7, v2 offset:71
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v8, v2 offset:72
 ; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v1, v6, v1
-; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX9-ALIGNED-NEXT:    v_or_b32_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v1, v0, v1
-; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
-; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
-; GFX9-ALIGNED-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-ALIGNED-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 8, v8
+; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
+; GFX9-ALIGNED-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-ALIGNED-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX9-ALIGNED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-ALIGNED-NEXT:    s_endpgm
 ;
 ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
 ; GFX9-UNALIGNED:       ; %bb.0: ; %entry
-; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x41
+; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-UNALIGNED-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-UNALIGNED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
 entry:
   %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1

diff  --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index d1d31691226f..f3208c29ca62 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -10,7 +10,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
 ; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
-; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
+; GCN: {{flat|global}}_store_short v{{.+}}, [[V_RESULT]]
 define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
   %bc= bitcast i16 %in to half
   %fabs = call half @llvm.fabs.f16(half %bc)
@@ -22,7 +22,7 @@ define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
 ; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
-; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
+; GCN: {{flat|global}}_store_short v{{.+}}, [[V_RESULT]]
 define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
   %fabs = call half @llvm.fabs.f16(half %in)
   store half %fabs, half addrspace(1)* %out
@@ -65,7 +65,7 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half
 ; GFX89-NOT: and
 ; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]]
 ; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]]
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
 define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
   %fabs = call half @llvm.fabs.f16(half %in0)
   %fmul = fmul half %fabs, %in1

diff  --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index b32dce30a2e0..1363c1c3dea2 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -84,13 +84,12 @@ define amdgpu_kernel void @global_store_2xi16_align2(i16 addrspace(1)* %p, i16 a
 ; GFX9-LABEL: global_store_2xi16_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX9-NEXT:    v_mov_b32_e32 v2, 1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
-; GFX9-NEXT:    global_store_short v[0:1], v3, off offset:2
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v2, s[0:1] offset:2
 ; GFX9-NEXT:    s_endpgm
   %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
   store i16 1, i16 addrspace(1)* %r, align 2
@@ -193,11 +192,10 @@ define amdgpu_kernel void @global_store_2xi16_align1(i16 addrspace(1)* %p, i16 a
 ; GFX9-LABEL: global_store_2xi16_align1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x20001
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x20001
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
   store i16 1, i16 addrspace(1)* %r, align 1
@@ -283,11 +281,10 @@ define amdgpu_kernel void @global_store_2xi16_align4(i16 addrspace(1)* %p, i16 a
 ; GFX9-LABEL: global_store_2xi16_align4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x20001
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x20001
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
   store i16 1, i16 addrspace(1)* %r, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 74c8b5464902..99251ef866ac 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -12,7 +12,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_undef_value_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half undef)
   store half %canonicalized, half addrspace(1)* %out
@@ -34,7 +34,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out)
 
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f16:
 ; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
   %val = bitcast i16 %val.arg to half
   %canonicalized = call half @llvm.canonicalize.f16(half %val)
@@ -59,7 +59,7 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
 ; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
   %val = load half, half addrspace(1)* %out
   %val.fabs = call half @llvm.fabs.f16(half %val)
@@ -70,7 +70,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16:
 ; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}|
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 
 ; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
 ; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
@@ -85,7 +85,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16:
 ; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 
 ; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -{{v[0-9]+}}
 ; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
@@ -100,7 +100,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %
 ; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_var_f16:
 ; VI: v_mul_f16_e32 [[REG:v[0-9]+]], -1.0, v{{[0-9]+}}
 ; GFX9: v_max_f16_e64 [[REG:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half addrspace(1)* %out) #2 {
   %val = load half, half addrspace(1)* %out
   %val.fneg = fneg half %val
@@ -113,7 +113,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half ad
 ; VI: v_mul_f16_e64 [[REG:v[0-9]+]], -1.0, |v{{[0-9]+}}|
 ; GFX9: v_max_f16_e64 [[REG:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}|
 
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 
 ; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|
 ; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
@@ -128,7 +128,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ha
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0.0)
   store half %canonicalized, half addrspace(1)* %out
@@ -137,7 +137,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half -0.0)
   store half %canonicalized, half addrspace(1)* %out
@@ -146,7 +146,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 1.0)
   store half %canonicalized, half addrspace(1)* %out
@@ -155,7 +155,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half -1.0)
   store half %canonicalized, half addrspace(1)* %out
@@ -164,7 +164,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 16.0)
   store half %canonicalized, half addrspace(1)* %out
@@ -173,7 +173,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)*
 
 ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
   store half %canonicalized, half addrspace(1)* %out
@@ -182,7 +182,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1
 
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
   store half %canonicalized, half addrspace(1)* %out
@@ -191,7 +191,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half a
 
 ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal1_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
   store half %canonicalized, half addrspace(1)* %out
@@ -200,7 +200,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1
 
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
   store half %canonicalized, half addrspace(1)* %out
@@ -209,7 +209,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half a
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00)
   store half %canonicalized, half addrspace(1)* %out
@@ -218,7 +218,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %o
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
   store half %canonicalized, half addrspace(1)* %out
@@ -227,7 +227,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrs
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
   store half %canonicalized, half addrspace(1)* %out
@@ -236,7 +236,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrs
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01)
   store half %canonicalized, half addrspace(1)* %out
@@ -245,7 +245,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF)
   store half %canonicalized, half addrspace(1)* %out
@@ -254,7 +254,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF)
   store half %canonicalized, half addrspace(1)* %out
@@ -263,7 +263,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01)
   store half %canonicalized, half addrspace(1)* %out
@@ -276,7 +276,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace
 ; VI-NOT: v_and_b32
 
 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+$}}
-; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX9: global_store_dword v{{.+}}, [[REG]], s
 define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
@@ -335,7 +335,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad
 ; VI-NOT: 0xffff
 
 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}}
-; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX9: global_store_dword v{{[0-9]+}}, [[REG]], s
 define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
@@ -352,7 +352,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspa
 ; VI-NOT: v_and_b32
 
 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+$}}
-; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX9: global_store_dword v{{[0-9]+}}, [[REG]], s
 define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
   %val = bitcast i32 %val.arg to <2 x half>
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
@@ -362,7 +362,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -371,7 +371,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -380,7 +380,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -389,7 +389,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -398,7 +398,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -407,7 +407,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrs
 
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -416,7 +416,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<
 
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -425,7 +425,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x
 
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -434,7 +434,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<
 
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -443,7 +443,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -452,7 +452,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspac
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -461,7 +461,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x hal
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -470,7 +470,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x hal
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -479,7 +479,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> a
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -488,7 +488,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> a
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -497,7 +497,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> a
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16:
 ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -542,7 +542,7 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
 
 ; GCN-LABEL: {{^}}s_test_canonicalize_undef_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index d95194a8716d..357de0e0eb49 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -20,7 +20,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f32:
 ; GFX678: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
 ; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
@@ -31,7 +31,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out)
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f32:
 ; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
 ; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
   store float %canonicalized, float addrspace(1)* %out
@@ -41,7 +41,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out,
 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32:
 ; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
 ; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %val.fabs = call float @llvm.fabs.f32(float %val)
@@ -53,7 +53,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)*
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32:
 ; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], -1.0, |{{v[0-9]+}}|
 ; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}|
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %val.fabs = call float @llvm.fabs.f32(float %val)
@@ -66,7 +66,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32:
 ; GFX678: v_mul_f32_e32 [[REG:v[0-9]+]], -1.0, {{v[0-9]+}}
 ; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %val.fneg = fneg float %val
@@ -77,7 +77,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)*
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_undef_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_undef_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float undef)
   store float %canonicalized, float addrspace(1)* %out
@@ -86,7 +86,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(float addrspace(1)*
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
   store float %canonicalized, float addrspace(1)* %out
@@ -95,7 +95,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %ou
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32:
 ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
   store float %canonicalized, float addrspace(1)* %out
@@ -104,7 +104,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %ou
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 1.0)
   store float %canonicalized, float addrspace(1)* %out
@@ -113,7 +113,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %ou
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float -1.0)
   store float %canonicalized, float addrspace(1)* %out
@@ -122,7 +122,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %ou
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 16.0)
   store float %canonicalized, float addrspace(1)* %out
@@ -131,7 +131,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1)
 
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, float addrspace(1)* %out
@@ -140,7 +140,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(flo
 
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, float addrspace(1)* %out
@@ -149,7 +149,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float
 
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, float addrspace(1)* %out
@@ -158,7 +158,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(flo
 
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, float addrspace(1)* %out
@@ -167,7 +167,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
   store float %canonicalized, float addrspace(1)* %out
@@ -176,7 +176,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float))
   store float %canonicalized, float addrspace(1)* %out
@@ -185,7 +185,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addr
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float))
   store float %canonicalized, float addrspace(1)* %out
@@ -194,7 +194,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addr
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float))
   store float %canonicalized, float addrspace(1)* %out
@@ -203,7 +203,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspac
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float))
   store float %canonicalized, float addrspace(1)* %out
@@ -212,7 +212,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspac
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float))
   store float %canonicalized, float addrspace(1)* %out
@@ -221,7 +221,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspac
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float))
   store float %canonicalized, float addrspace(1)* %out
@@ -230,7 +230,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspac
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f64:
 ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
 define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %canonicalized = call double @llvm.canonicalize.f64(double %val)
@@ -240,7 +240,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out
 
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f64:
 ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
 define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double %val)
   store double %canonicalized, double addrspace(1)* %out
@@ -249,7 +249,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64:
 ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], |{{v\[[0-9]+:[0-9]+\]}}|, |{{v\[[0-9]+:[0-9]+\]}}|
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
 define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %val.fabs = call double @llvm.fabs.f64(double %val)
@@ -260,7 +260,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)*
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64:
 ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]\]]], -|{{v\[[0-9]+:[0-9]+\]}}|, -|{{v\[[0-9]+:[0-9]+\]}}|
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %val.fabs = call double @llvm.fabs.f64(double %val)
@@ -272,7 +272,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspac
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64:
 ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -{{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %val.fneg = fneg double %val
@@ -284,7 +284,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)*
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f64:
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 0.0)
   store double %canonicalized, double addrspace(1)* %out
@@ -294,7 +294,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %o
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double -0.0)
   store double %canonicalized, double addrspace(1)* %out
@@ -304,7 +304,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %o
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 1.0)
   store double %canonicalized, double addrspace(1)* %out
@@ -314,7 +314,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %o
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double -1.0)
   store double %canonicalized, double addrspace(1)* %out
@@ -324,7 +324,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %o
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 16.0)
   store double %canonicalized, double addrspace(1)* %out
@@ -334,7 +334,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f64:
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
   store double %canonicalized, double addrspace(1)* %out
@@ -344,7 +344,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(dou
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
   store double %canonicalized, double addrspace(1)* %out
@@ -354,7 +354,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f64:
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, double addrspace(1)* %out
@@ -364,7 +364,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(dou
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, double addrspace(1)* %out
@@ -374,7 +374,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
   store double %canonicalized, double addrspace(1)* %out
@@ -384,7 +384,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)*
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
   store double %canonicalized, double addrspace(1)* %out
@@ -394,7 +394,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double add
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
   store double %canonicalized, double addrspace(1)* %out
@@ -404,7 +404,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double add
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
   store double %canonicalized, double addrspace(1)* %out
@@ -414,7 +414,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspa
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double))
   store double %canonicalized, double addrspace(1)* %out
@@ -424,7 +424,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspa
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double))
   store double %canonicalized, double addrspace(1)* %out
@@ -434,7 +434,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspa
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f64:
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double))
   store double %canonicalized, double addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
index d62155c1220d..1a005731d3c7 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefixes=GCN,GCN-DENORM %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefixes=GCN,GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH %s
 
 ; GCN-LABEL: {{^}}div_1_by_x_25ulp:
 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
@@ -13,7 +13,7 @@
 
 ; GCN-FLUSH:      v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
 
-; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
   %div = fdiv float 1.000000e+00, %load, !fpmath !0
@@ -33,7 +33,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) {
 
 ; GCN-FLUSH:      v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
 
-; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
   %div = fdiv float -1.000000e+00, %load, !fpmath !0
@@ -53,7 +53,7 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) {
 
 ; GCN-FLUSH:      v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
 
-; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
   %neg = fsub float -0.000000e+00, %load
@@ -74,7 +74,7 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
 
 ; GCN-FLUSH:      v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
 
-; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
   %neg = fsub float -0.000000e+00, %load
@@ -112,7 +112,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg
 ; GCN-FLUSH:      v_rcp_f32_e32
 ; GCN-FLUSH:      v_rcp_f32_e32
 ; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
-; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
+; GCN-FLUSH:      global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
   %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
   %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %load, !fpmath !0
@@ -121,6 +121,7 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
 }
 
 ; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
+; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
@@ -156,6 +157,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* %
 }
 
 ; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp:
+; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
@@ -183,7 +185,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* %
 ; GCN-FLUSH:      v_rcp_f32_e64
 ; GCN-FLUSH:      v_rcp_f32_e64
 ; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
-; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
+; GCN-FLUSH:      global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
   %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
   %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
@@ -221,7 +223,7 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %
 ; GCN-FLUSH:      v_rcp_f32_e32
 ; GCN-FLUSH:      v_rcp_f32_e32
 ; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
-; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
+; GCN-FLUSH:      global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
   %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
   %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
@@ -334,7 +336,7 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(<4 x float> addrspace(1)* %
 ; GCN-FLUSH:      v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
 ; GCN-FLUSH:      v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
 
-; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num) {
   %load = load float, float addrspace(1)* %arg, align 4
   %div = fdiv float %num, %load, !fpmath !0
@@ -345,7 +347,7 @@ define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num
 ; GCN-LABEL: {{^}}div_1_by_x_fast:
 ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
   %div = fdiv fast float 1.000000e+00, %load, !fpmath !0
@@ -356,7 +358,7 @@ define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
 ; GCN-LABEL: {{^}}div_minus_1_by_x_fast:
 ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
   %div = fdiv fast float -1.000000e+00, %load, !fpmath !0
@@ -367,7 +369,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
 ; GCN-LABEL: {{^}}div_1_by_minus_x_fast:
 ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
   %neg = fsub float -0.000000e+00, %load, !fpmath !0
@@ -379,7 +381,7 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
 ; GCN-LABEL: {{^}}div_minus_1_by_minus_x_fast:
 ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
   %neg = fsub float -0.000000e+00, %load, !fpmath !0

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index c4ee8ad40925..42229a7d4a79 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -26,7 +26,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x,
 ; GFX89-NOT: _and
 ; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{s[0-9]+}}, -|{{v[0-9]+}}|
 ; GFX89-NOT: [[MUL]]
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[MUL]]
 define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
   %fabs = call half @llvm.fabs.f16(half %x)
   %fsub = fsub half -0.0, %fabs
@@ -134,8 +134,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %o
 ; GFX9: v_mov_b32_e32 [[V_ABS:v[0-9]+]], [[ABS]]
 ; GFX9: s_xor_b32 [[NEG:s[0-9]+]], [[ABS]], 0x80008000
 ; GFX9-DAG: v_mov_b32_e32 [[V_NEG:v[0-9]+]], [[NEG]]
-; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_ABS]]
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_NEG]]
+; GFX9-DAG: global_store_dword v{{[0-9]+}}, [[V_ABS]], s{{\[[0-9]+:[0-9]+\]}}
+; GFX9: global_store_dword v{{[0-9]+}}, [[V_NEG]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 5afcafc781f4..cca53985196a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -32,7 +32,7 @@ define amdgpu_kernel void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)
 ; GCN: s_load_dword [[NEG_VALUE:s[0-9]+]],
 ; GCN: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
 ; GCN: v_mov_b32_e32 [[V_XOR:v[0-9]+]], [[XOR]]
-; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_XOR]]
+; GCN: {{flat|global}}_store_short v{{.+}}, [[V_XOR]]
 define amdgpu_kernel void @s_fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
   %bc = bitcast i16 %in to half
   %fsub = fsub half -0.0, %bc

diff  --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 123c40a7c2d0..f2d3c59fc679 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -48,16 +48,15 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    s_not_b32 s0, s0
 ; GFX9-NEXT:    s_lshr_b32 s1, s4, 1
-; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_alignbit_b32 v2, s1, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_alignbit_b32 v1, s1, v1, v2
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_i32:
@@ -108,12 +107,11 @@ define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, 25
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_alignbit_b32 v1, s0, v1, 25
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_i32_imm:
@@ -188,22 +186,21 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX9-NEXT:    s_not_b32 s1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_lshr_b32 s7, s5, 1
 ; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
-; GFX9-NEXT:    v_alignbit_b32 v1, s7, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_lshr_b32 s5, s5, 1
+; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    s_not_b32 s0, s0
 ; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
 ; GFX9-NEXT:    s_lshr_b32 s1, s4, 1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_alignbit_b32 v0, s1, v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_alignbit_b32 v0, s1, v0, v3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_v2i32:
@@ -265,14 +262,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 23
-; GFX9-NEXT:    v_alignbit_b32 v0, s4, v2, 25
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 25
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_v2i32_imm:
@@ -373,13 +369,14 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX9-NEXT:    s_not_b32 s3, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    s_lshr_b32 s11, s7, 1
 ; GFX9-NEXT:    v_alignbit_b32 v0, s7, v0, 1
-; GFX9-NEXT:    v_alignbit_b32 v3, s11, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    s_lshr_b32 s7, s7, 1
+; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX9-NEXT:    s_not_b32 s2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
@@ -396,11 +393,9 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ; GFX9-NEXT:    s_not_b32 s0, s0
 ; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
 ; GFX9-NEXT:    s_lshr_b32 s1, s4, 1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s0
-; GFX9-NEXT:    v_alignbit_b32 v0, s1, v0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, s12
-; GFX9-NEXT:    v_mov_b32_e32 v5, s13
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s0
+; GFX9-NEXT:    v_alignbit_b32 v0, s1, v0, v5
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_v4i32:
@@ -478,9 +473,8 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
 ; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 31
@@ -489,7 +483,7 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
 ; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 25
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 31
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_v4i32_imm:

diff  --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 0874b094a2c0..ef7eff29f8b0 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -51,13 +51,12 @@ define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_alignbit_b32 v2, s4, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, v2
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_i32:
@@ -105,12 +104,11 @@ define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, 7
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_alignbit_b32 v1, s0, v1, 7
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_i32_imm:
@@ -173,16 +171,15 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_v2i32:
@@ -240,14 +237,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
-; GFX9-NEXT:    v_alignbit_b32 v0, s4, v2, 7
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 7
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_v2i32_imm:
@@ -324,6 +320,7 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -335,11 +332,9 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v4, s0
-; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, s12
-; GFX9-NEXT:    v_mov_b32_e32 v5, s13
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s0
+; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v5
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_v4i32:
@@ -409,9 +404,8 @@ define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
 ; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 1
@@ -420,7 +414,7 @@ define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
 ; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 7
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_v4i32_imm:

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 3991b2722f9b..122b10d1e4e6 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -2508,15 +2508,14 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NEXT:    v_mov_b32_e32 v5, s5
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[4:5], off
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[4:5], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[4:5]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v4, s[4:5] offset:16
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
@@ -2541,6 +2540,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
@@ -2548,14 +2548,13 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, s4
-; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[4:5]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v4, s[4:5] offset:16
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[4:5], off
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[4:5], off offset:16
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2655,17 +2654,16 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v13, s5
-; GFX9-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[12:13], off
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v12, s[4:5]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v12, s[4:5] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v12, s[4:5] offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v12, s[4:5] offset:48
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
@@ -2690,6 +2688,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
@@ -2697,16 +2696,15 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v13, s5
-; GFX10-NEXT:    v_mov_b32_e32 v12, s4
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v12, s[4:5]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v12, s[4:5] offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v12, s[4:5] offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v12, s[4:5] offset:48
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
-; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[12:13], off
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:48
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2732,21 +2730,20 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v29, s5
-; GFX9-NEXT:    v_mov_b32_e32 v28, s4
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[28:29], off
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX9-NEXT:    global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX9-NEXT:    global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[4:5]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[4:5] offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[4:5] offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[4:5] offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[4:5] offset:80
+; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
+; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12
@@ -2771,6 +2768,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
@@ -2778,20 +2776,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v29, s5
-; GFX10-NEXT:    v_mov_b32_e32 v28, s4
+; GFX10-NEXT:    s_clause 0x7
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v28, s[4:5]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v28, s[4:5] offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v28, s[4:5] offset:48
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v28, s[4:5] offset:64
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v28, s[4:5] offset:80
+; GFX10-NEXT:    global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
+; GFX10-NEXT:    global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12
-; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[28:29], off
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX10-NEXT:    global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX10-NEXT:    global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX10-NEXT:    global_load_dwordx4 v[28:31], v[28:29], off offset:112
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -2817,21 +2814,20 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v29, s5
-; GFX9-NEXT:    v_mov_b32_e32 v28, s4
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[28:29], off
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX9-NEXT:    global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX9-NEXT:    global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[4:5]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[4:5] offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[4:5] offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[4:5] offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[4:5] offset:80
+; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
+; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
@@ -2860,28 +2856,28 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    global_load_dword v32, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v29, s5
-; GFX10-NEXT:    v_mov_b32_e32 v28, s4
+; GFX10-NEXT:    s_clause 0x7
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v28, s[4:5]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v28, s[4:5] offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v28, s[4:5] offset:48
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v28, s[4:5] offset:64
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v28, s[4:5] offset:80
+; GFX10-NEXT:    global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
+; GFX10-NEXT:    global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
-; GFX10-NEXT:    global_load_dword v32, v[0:1], off
-; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[28:29], off
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX10-NEXT:    global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX10-NEXT:    global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX10-NEXT:    global_load_dwordx4 v[28:31], v[28:29], off offset:112
 ; GFX10-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -2987,15 +2983,14 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
-; GFX9-NEXT:    global_load_ubyte v0, v[2:3], off
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] offset:4
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[4:5]
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12
@@ -3020,6 +3015,7 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
@@ -3027,14 +3023,13 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[4:5]
+; GFX10-NEXT:    global_load_dword v1, v1, s[4:5] offset:4
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_ubyte v0, v[1:2], off
-; GFX10-NEXT:    global_load_dword v1, v[1:2], off offset:4
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
@@ -3230,14 +3225,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[4:5]
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
@@ -3281,6 +3275,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
@@ -3288,12 +3283,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[4:5]
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index f93b6d3de7b4..00766854d3b5 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -33,7 +33,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %
 ; GFX900-NEXT: s_cbranch_execnz [[LOOP]]
 
 ; GFX908-NOT: v_add_f32
-; GFX908:     global_atomic_add_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off
+; GFX908:     global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s
 ; GFX908-NOT: s_cbranch_execnz
 define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) #0 {
   %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst

diff  --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index b9760a6cba1a..f28b9fdcdccc 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -8,13 +8,26 @@
 ; No vgpr offset, constants
 ; --------------------------------------------------------------------------------
 
+; SGPR base only
+define amdgpu_ps float @global_load_saddr_i8_offset_0(i8 addrspace(1)* inreg %sbase) {
+; GCN-LABEL: global_load_saddr_i8_offset_0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+  %load = load i8, i8 addrspace(1)* %sbase
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
 ; SGPR base with maximum gfx9 immediate offset
 define amdgpu_ps float @global_load_saddr_i8_offset_4095(i8 addrspace(1)* inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_4095:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -65,9 +78,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4097(i8 addrspace(1)* inreg
 define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4096:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-4096
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -141,9 +153,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inr
 define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_2048:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2048
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2048
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -164,9 +175,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg
 define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_2049:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2049
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2049
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -187,9 +197,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg
 define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_2050:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2050
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2050
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -210,9 +219,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg
 define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inreg %sbase) {
 ; GCN-LABEL: global_load_saddr_i8_offset_neg2048:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2048
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2048
@@ -226,9 +234,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inr
 define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg2049:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2049
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2049
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -250,9 +257,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inr
 define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg2050:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2050
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2050
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -2316,5 +2322,44 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128
   ret <2 x half> %cast
 }
 
+; --------------------------------------------------------------------------------
+; or-with-constant as add
+; --------------------------------------------------------------------------------
+
+; Check add-as-or with split 64-bit or.
+define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(i8 addrspace(6)* inreg %sbase, i32 %idx) {
+; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_or_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    global_load_ubyte v0, v[0:1], off
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+  %zext.idx = zext i32 %idx to i64
+  %or = or i64 %zext.idx, 16
+  %addr = inttoptr i64 %or to i8 addrspace(1)*
+  %load = load i8, i8 addrspace(1)* %addr
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(i8 addrspace(6)* inreg %sbase, i32 %idx) {
+; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_or_b32_e32 v0, 0x1040, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    global_load_ubyte v0, v[0:1], off
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+  %zext.idx = zext i32 %idx to i64
+  %or = or i64 %zext.idx, 4160
+  %addr = inttoptr i64 %or to i8 addrspace(1)*
+  %load = load i8, i8 addrspace(1)* %addr
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
 !0 = !{i32 0, i32 1073741824} ; (1 << 30)
 !1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 7bd2fd270747..93dcde07d9ff 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -4,7 +4,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i32_offset:
 ; SIVI: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}}
 define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -13,7 +13,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}atomic_add_i32_max_neg_offset:
-; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:-4096{{$}}
+; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}}
 define amdgpu_kernel void @atomic_add_i32_max_neg_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 -1024
@@ -57,7 +57,7 @@ entry:
 ; SIVI: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -84,7 +84,7 @@ entry:
 ; SIVI: buffer_store_dword [[RET]]
 
 ; GFX9: global_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX9: global_store_dword v{{[0-9]+}}, [[RET]], s
 define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
@@ -96,7 +96,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i32:
 ; SIVI: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
@@ -107,8 +107,8 @@ entry:
 ; SIVI: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX9: global_atomic_add [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX9: global_store_dword v{{[0-9]+}}, [[RET]], s
 define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
@@ -144,7 +144,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i32_offset:
 ; SIVI: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 
-; GFX9: global_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}}
 define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -156,7 +156,7 @@ entry:
 ; SIVI: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_and [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -196,7 +196,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i32:
 ; SIVI: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
 
-; GFX9: global_atomic_and v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
@@ -207,7 +207,7 @@ entry:
 ; SIVI: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
@@ -244,7 +244,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i32_offset:
 ; SIVI: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 
-; GFX9: global_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
 define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -256,7 +256,7 @@ entry:
 ; SIVI: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_sub v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -296,7 +296,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i32:
 ; SIVI: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
 
-; GFX9: global_atomic_sub v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
@@ -307,7 +307,7 @@ entry:
 ; SIVI: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_sub [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
@@ -344,7 +344,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i32_offset:
 ; SIVI: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 
-; GFX9: global_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_smax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}}
 define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -356,7 +356,7 @@ entry:
 ; SIVI: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_smax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -396,7 +396,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i32:
 ; SIVI: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
 
-; GFX9: global_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_smax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}}
 define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
@@ -407,7 +407,7 @@ entry:
 ; SIVI: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_smax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
@@ -444,7 +444,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i32_offset:
 ; SIVI: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 
-; GFX9: global_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_umax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
 define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -456,7 +456,7 @@ entry:
 ; SIVI: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_umax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -495,7 +495,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i32:
 ; SIVI: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
 
-; GFX9: global_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_umax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}}
 define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
@@ -506,7 +506,7 @@ entry:
 ; SIVI: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_umax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}}
 define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
@@ -542,7 +542,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i32_offset:
 ; SIVI: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 
-; GFX9: global_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_smin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
 define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -554,7 +554,7 @@ entry:
 ; SIVI: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_smin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -593,7 +593,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i32:
 ; SIVI: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
 
-; GFX9: global_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_smin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
@@ -604,7 +604,7 @@ entry:
 ; SIVI: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_smin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}}
 define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
@@ -640,7 +640,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i32_offset:
 ; SIVI: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 
-; GFX9: global_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_umin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
 define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -652,7 +652,7 @@ entry:
 ; SIVI: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_umin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -690,7 +690,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i32:
 ; SIVI: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_umin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}}
 define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
@@ -701,7 +701,7 @@ entry:
 ; SIVI: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_umin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
@@ -737,7 +737,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i32_offset:
 ; SIVI: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 
-; GFX9: global_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_or v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
 define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -749,7 +749,7 @@ entry:
 ; SIVI: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_or [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -788,7 +788,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i32:
 ; SIVI: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
 
-; GFX9: global_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_or v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}}
 define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
@@ -799,7 +799,7 @@ entry:
 ; SIVI: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_or [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}}
 define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
@@ -835,7 +835,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i32_offset:
 ; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 
-; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
 define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -846,7 +846,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_f32_offset:
 ; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 
-; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
 define amdgpu_kernel void @atomic_xchg_f32_offset(float addrspace(1)* %out, float %in) {
 entry:
   %gep = getelementptr float, float addrspace(1)* %out, i64 4
@@ -858,7 +858,7 @@ entry:
 ; SIVI: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_swap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -896,7 +896,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32:
 ; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}}
 define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
@@ -907,7 +907,7 @@ entry:
 ; SIVI: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_swap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}}
 define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
@@ -943,7 +943,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset:
 ; SIVI: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 
-; GFX9: global_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:16{{$}}
+; GFX9: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
 define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -955,7 +955,7 @@ entry:
 ; SIVI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword v[[RET]]
 
-; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]],  v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -997,7 +997,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32:
 ; SIVI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
 
-; GFX9: global_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]{{$}}
 define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
@@ -1008,7 +1008,7 @@ entry:
 ; SIVI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; SIVI: buffer_store_dword v[[RET]]
 
-; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], off glc{{$}}
+; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
@@ -1046,7 +1046,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i32_offset:
 ; SIVI: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 
-; GFX9: global_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}}
 define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -1058,7 +1058,7 @@ entry:
 ; SIVI: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_xor v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -1096,7 +1096,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i32:
 ; SIVI: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}}
 define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
@@ -1107,7 +1107,7 @@ entry:
 ; SIVI: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_xor [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
@@ -1145,7 +1145,7 @@ entry:
 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:16 glc{{$}}
+; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %in, i64 4
@@ -1161,7 +1161,7 @@ entry:
 ; VI-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, -1
 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 
-; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:-512 glc{{$}}
+; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-512 glc{{$}}
 define amdgpu_kernel void @atomic_load_i32_negoffset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %in, i64 -128
@@ -1175,7 +1175,7 @@ entry:
 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:16 glc{{$}}
+; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
 define amdgpu_kernel void @atomic_load_f32_offset(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
   %gep = getelementptr float, float addrspace(1)* %in, i64 4
@@ -1189,7 +1189,7 @@ entry:
 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; SIVI: buffer_store_dword [[RET]]
 
-; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], off glc
+; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc
 define amdgpu_kernel void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
@@ -1244,7 +1244,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_store_i32_offset:
 ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
-; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off offset:16{{$}}
+; GFX9: global_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
 define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
@@ -1255,7 +1255,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_store_i32:
 ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
 ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
-; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off{{$}}
+; GFX9: global_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
@@ -1265,7 +1265,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_store_f32:
 ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
 ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
-; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off{{$}}
+; GFX9: global_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 define amdgpu_kernel void @atomic_store_f32(float %in, float addrspace(1)* %out) {
 entry:
   store atomic float %in, float addrspace(1)* %out seq_cst, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 88bec6cf66df..456080dc8e02 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -5,7 +5,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i64_offset:
 ; CIVI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
 
-; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], off offset:32{{$}}
+; GFX9: global_atomic_add_x2 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32{{$}}
 define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -17,7 +17,7 @@ entry:
 ; CIVI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], off offset:32 glc{{$}}
+; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}}
 define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -55,7 +55,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i64:
 ; SIVI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_add_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
@@ -66,7 +66,7 @@ entry:
 ; CIVI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
@@ -101,7 +101,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i64_offset:
 ; CIVI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_and_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -113,7 +113,7 @@ entry:
 ; CIVI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -151,7 +151,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i64:
 ; CIVI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_and_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
@@ -162,7 +162,7 @@ entry:
 ; CIVI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
@@ -197,7 +197,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i64_offset:
 ; CIVI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_sub_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -209,7 +209,7 @@ entry:
 ; CIVI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -247,7 +247,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i64:
 ; CIVI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_sub_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
@@ -258,7 +258,7 @@ entry:
 ; CIVI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
@@ -293,7 +293,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i64_offset:
 ; CIVI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_smax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -305,7 +305,7 @@ entry:
 ; CIVI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -343,7 +343,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i64:
 ; CIVI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_smax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
@@ -354,7 +354,7 @@ entry:
 ; CIVI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
@@ -389,7 +389,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i64_offset:
 ; CIVI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_umax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -401,7 +401,7 @@ entry:
 ; CIVI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -439,7 +439,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i64:
 ; CIVI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_umax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
@@ -450,7 +450,7 @@ entry:
 ; CIVI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
@@ -485,7 +485,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i64_offset:
 ; CIVI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_smin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -497,7 +497,7 @@ entry:
 ; CIVI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -535,7 +535,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i64:
 ; CIVI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_smin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
@@ -546,7 +546,7 @@ entry:
 ; CIVI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
@@ -582,7 +582,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_offset:
 ; CIVI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
 
-; GFX9: global_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_umin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -594,7 +594,7 @@ entry:
 ; CIVI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -632,7 +632,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i64:
 ; CIVI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_umin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
@@ -643,7 +643,7 @@ entry:
 ; CIVI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
@@ -678,7 +678,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i64_offset:
 ; CIVI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_or_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -690,7 +690,7 @@ entry:
 ; CIVI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -728,7 +728,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i64:
 ; CIVI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_or_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}}
 define amdgpu_kernel void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
@@ -739,7 +739,7 @@ entry:
 ; CIVI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
@@ -775,7 +775,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
 ; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
 
-; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -786,7 +786,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_f64_offset:
 ; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
 
-; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @atomic_xchg_f64_offset(double addrspace(1)* %out, double %in) {
 entry:
   %gep = getelementptr double, double addrspace(1)* %out, i64 4
@@ -798,7 +798,7 @@ entry:
 ; CIVI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -836,7 +836,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64:
 ; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}}
 define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
@@ -847,7 +847,7 @@ entry:
 ; CIVI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
@@ -882,7 +882,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i64_offset:
 ; CIVI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_xor_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -894,7 +894,7 @@ entry:
 ; CIVI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -932,7 +932,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i64:
 ; CIVI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_xor_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}}
 define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
@@ -943,7 +943,7 @@ entry:
 ; CIVI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
@@ -979,7 +979,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset:
 ; CIVI: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_cmpswap_x2 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -1004,7 +1004,7 @@ entry:
 ; CIVI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]:
 
-; GFX9: global_atomic_cmpswap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_cmpswap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -1044,7 +1044,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64:
 ; CIVI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_atomic_cmpswap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]{{$}}
 define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) {
 entry:
   %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
@@ -1055,7 +1055,7 @@ entry:
 ; CIVI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]:
 
-; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off glc{{$}}
+; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+:[0-9]+}}] glc{{$}}
 define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
 entry:
   %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
@@ -1095,7 +1095,7 @@ entry:
 ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:32 glc{{$}}
+; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}}
 define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %in, i64 4
@@ -1115,7 +1115,7 @@ entry:
 
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:-32 glc{{$}}
+; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-32 glc{{$}}
 define amdgpu_kernel void @atomic_load_i64_neg_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %in, i64 -4
@@ -1129,7 +1129,7 @@ entry:
 ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; CIVI: buffer_store_dwordx2 [[RET]]
 
-; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], off glc{{$}}
+; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 define amdgpu_kernel void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
   %val = load atomic i64, i64 addrspace(1)* %in seq_cst, align 8
@@ -1184,7 +1184,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_store_i64_offset:
 ; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
 ; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GFX9: global_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:32{{$}}
+; GFX9: global_store_dwordx2 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32{{$}}
 define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
@@ -1195,7 +1195,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_store_i64:
 ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
 ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GFX9: global_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], off{{$}}
+; GFX9: global_store_dwordx2 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]\]}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) {
 entry:
   store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
index 5df0896e46f3..069658cd1813 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -18,7 +18,7 @@
 ; WAVE64:    .sgpr_count:     8
 ; WAVE32:    .sgpr_count:     10
 ; CHECK:     .symbol:         test.kd
-; CHECK:     .vgpr_count:     6
+; CHECK:     .vgpr_count:     {{3|6}}
 ; WAVE64:    .wavefront_size: 64
 ; WAVE32:    .wavefront_size: 32
 define amdgpu_kernel void @test(
@@ -50,8 +50,8 @@ entry:
 ; CHECK:   .name:       num_spilled_sgprs
 ; GFX700:   .sgpr_spill_count: 38
 ; GFX803:   .sgpr_spill_count: 22
-; GFX900:   .sgpr_spill_count: 22
-; GFX1010:  .sgpr_spill_count: 22
+; GFX900:   .sgpr_spill_count: 48
+; GFX1010:  .sgpr_spill_count: 48
 ; CHECK:   .symbol:     num_spilled_sgprs.kd
 define amdgpu_kernel void @num_spilled_sgprs(
     i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
@@ -88,7 +88,7 @@ entry:
 
 ; CHECK:   .name:       num_spilled_vgprs
 ; CHECK:   .symbol:     num_spilled_vgprs.kd
-; CHECK:   .vgpr_spill_count: 14
+; CHECK:   .vgpr_spill_count: {{13|14}}
 define amdgpu_kernel void @num_spilled_vgprs() #1 {
   %val0 = load volatile float, float addrspace(1)* @var
   %val1 = load volatile float, float addrspace(1)* @var

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 76de5b908e02..98e7983ec4bc 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -17,7 +17,7 @@
 ; CHECK:     KernargSegmentAlign:     8
 ; CHECK:     WavefrontSize:           64
 ; CHECK:     NumSGPRs:                8
-; CHECK:     NumVGPRs:                6
+; CHECK:     NumVGPRs:                {{3|6}}
 ; CHECK:     MaxFlatWorkGroupSize:    1024
 define amdgpu_kernel void @test(
     half addrspace(1)* %r,
@@ -40,7 +40,7 @@ entry:
 ; CHECK:     KernargSegmentAlign:     8
 ; CHECK:     WavefrontSize:           64
 ; CHECK:     NumSGPRs:                8
-; CHECK:     NumVGPRs:                6
+; CHECK:     NumVGPRs:                {{3|6}}
 ; CHECK:     MaxFlatWorkGroupSize:    256
 define amdgpu_kernel void @test_max_flat_workgroup_size(
     half addrspace(1)* %r,
@@ -59,7 +59,7 @@ entry:
 ; CHECK:   CodeProps:
 ; GFX700:     NumSpilledSGPRs: 38
 ; GFX803:     NumSpilledSGPRs: 22
-; GFX900:     NumSpilledSGPRs: 22
+; GFX900:     NumSpilledSGPRs: {{22|48}}
 define amdgpu_kernel void @num_spilled_sgprs(
     i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
     i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32],
@@ -96,7 +96,7 @@ entry:
 ; CHECK-LABEL: - Name:       num_spilled_vgprs
 ; CHECK:   SymbolName: 'num_spilled_vgprs@kd'
 ; CHECK:   CodeProps:
-; CHECK:     NumSpilledVGPRs: 14
+; CHECK:     NumSpilledVGPRs: {{13|14}}
 define amdgpu_kernel void @num_spilled_vgprs() #1 {
   %val0 = load volatile float, float addrspace(1)* @var
   %val1 = load volatile float, float addrspace(1)* @var

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 192ddd4e42ca..6493b29fe09e 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -66,7 +66,7 @@
 ; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
 ; Make sure we generate flat store for HSA
 ; PRE-GFX10: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
-; GFX10: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+; GFX10: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
 
 ; HSA: .Lfunc_end0:
 ; HSA: .size   simple, .Lfunc_end0-simple

diff  --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index ffa17c94a93a..8b9931af6a19 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -16,31 +16,30 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:  BB0_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_lo_u32 v3, s5, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v3
-; GFX9-NEXT:    v_not_b32_e32 v6, v3
-; GFX9-NEXT:    v_mul_lo_u32 v6, s2, v6
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v3
-; GFX9-NEXT:    v_add_u32_e32 v4, s4, v4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_add_u32_e32 v5, s4, v6
+; GFX9-NEXT:    v_mul_lo_u32 v2, s5, v0
+; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX9-NEXT:    v_not_b32_e32 v5, v2
+; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v5
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, s4, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_add_u32_e32 v4, s4, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    s_add_u32 s4, s4, 1
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    s_addc_u32 s5, s5, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    s_cmpk_eq_i32 s4, 0x400
-; GFX9-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX9-NEXT:    s_cbranch_scc0 BB0_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -76,29 +75,28 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:  BB1_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_lo_u32 v3, s5, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v3
-; GFX9-NEXT:    v_not_b32_e32 v3, v3
-; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v3
-; GFX9-NEXT:    v_add_u32_e32 v4, s4, v4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v2, s5, v0
+; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX9-NEXT:    v_not_b32_e32 v2, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v2
 ; GFX9-NEXT:    v_add_u32_e32 v3, s4, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
+; GFX9-NEXT:    v_add_u32_e32 v2, s4, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    s_add_u32 s4, s4, 1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    s_addc_u32 s5, s5, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    s_cmpk_eq_i32 s4, 0x400
-; GFX9-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX9-NEXT:    s_cbranch_scc0 BB1_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -137,28 +135,27 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:  BB2_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_mul_lo_u32 v4, v3, s3
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v3
-; GFX9-NEXT:    v_sub_u32_e32 v4, s4, v4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s3, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v2, s4, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, v2, s3
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
 ; GFX9-NEXT:    s_add_i32 s4, s4, 1
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v4
+; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX9-NEXT:    s_cmpk_eq_i32 s4, 0x400
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v3
-; GFX9-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX9-NEXT:    s_cbranch_scc0 BB2_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -197,24 +194,23 @@ define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    s_mov_b32 s3, 0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:  BB3_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s3, v3
+; GFX9-NEXT:    v_mul_hi_u32 v2, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
+; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    s_add_i32 s3, s3, 1
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
 ; GFX9-NEXT:    s_cmpk_eq_i32 s3, 0x400
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX9-NEXT:    s_cbranch_scc0 BB3_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index 2a7b47bf68bc..a7b137b3b92a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -64,6 +64,7 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -73,31 +74,28 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot2:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, s4, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot2:
@@ -105,6 +103,7 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -112,10 +111,8 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                  <2 x i16> addrspace(1)* %src2,
                                  i32 addrspace(1)* nocapture %dst) {
@@ -200,6 +197,7 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -207,16 +205,14 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v0, s2, v0
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT:    v_add_u32_e32 v2, s5, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v1, s2, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NODL-NEXT:    v_add_u32_e32 v1, s5, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot2_MulMul:
@@ -224,6 +220,7 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -231,16 +228,14 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_and_b32 s6, s3, s2
 ; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v0, s2, v0
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, s5, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v1, s2, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, s5, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot2_MulMul:
@@ -248,6 +243,7 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
@@ -261,10 +257,8 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v0, s5, s6
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s4, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, s4, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                         <2 x i16> addrspace(1)* %src2,
                                         i32 addrspace(1)* nocapture %dst) {
@@ -341,6 +335,7 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -350,31 +345,28 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot2:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_dot2_i32_i16 v2, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_dot2_i32_i16 v1, s4, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot2:
@@ -382,6 +374,7 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -389,10 +382,8 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    v_dot2_i32_i16 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot2_i32_i16 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                  <2 x i16> addrspace(1)* %src2,
                                  i32 addrspace(1)* nocapture %dst) {
@@ -470,6 +461,7 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -479,20 +471,19 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot2_MixedTypedMul:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -502,14 +493,12 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot2_MixedTypedMul:
@@ -517,6 +506,7 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -529,10 +519,8 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX10-DL-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                <2 x i16> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
@@ -614,6 +602,7 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -623,31 +612,28 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot2_alt_AddOperands:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, s4, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot2_alt_AddOperands:
@@ -655,6 +641,7 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -662,10 +649,8 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                  <2 x i16> addrspace(1)* %src2,
                                                  i32 addrspace(1)* nocapture %dst) {
@@ -743,6 +728,7 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -752,20 +738,19 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-NODL-NEXT:    s_and_b32 s6, s3, 0xffff
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot2_MixedExt:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -775,14 +760,12 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-DL-NEXT:    s_and_b32 s6, s3, 0xffff
 ; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot2_MixedExt:
@@ -790,6 +773,7 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -802,10 +786,8 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX10-DL-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                           <2 x i16> addrspace(1)* %src2,
                                           i32 addrspace(1)* nocapture %dst) {
@@ -875,38 +857,36 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NODL-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s2, s2, v0
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, s4, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, s2, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, s4, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: notudot2_SameVec:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s2, s2, v0
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, s4, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, s2, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, s4, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: notudot2_SameVec:
@@ -914,6 +894,7 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
@@ -923,10 +904,8 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s2, s3
 ; GFX10-DL-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s2, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                             <2 x i16> addrspace(1)* %src2,
                                             i32 addrspace(1)* nocapture %dst) {
@@ -1008,6 +987,7 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -1017,31 +997,28 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot2_v4i16:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, s4, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot2_v4i16:
@@ -1049,6 +1026,7 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1056,10 +1034,8 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i16> addrspace(1)* %src2,
                                        i32 addrspace(1)* nocapture %dst) {
@@ -1141,6 +1117,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x4
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x4
@@ -1150,31 +1127,28 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot2_v4i16_Hi:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x4
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, s4, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot2_v4i16_Hi:
@@ -1182,6 +1156,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1189,10 +1164,8 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                           <4 x i16> addrspace(1)* %src2,
                                           i32 addrspace(1)* nocapture %dst) {
@@ -1274,6 +1247,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_mov_b32 s8, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
@@ -1282,15 +1256,13 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_and_b32 s3, s3, s8
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s2, s8
 ; GFX9-NODL-NEXT:    s_and_b32 s5, s5, s8
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
 ; GFX9-NODL-NEXT:    s_and_b32 s4, s4, s8
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: notudot2_v4i16_Even:
@@ -1298,6 +1270,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_mov_b32 s8, 0xffff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
@@ -1306,15 +1279,13 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_and_b32 s3, s3, s8
 ; GFX9-DL-NEXT:    s_and_b32 s2, s2, s8
 ; GFX9-DL-NEXT:    s_and_b32 s5, s5, s8
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
 ; GFX9-DL-NEXT:    s_and_b32 s4, s4, s8
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: notudot2_v4i16_Even:
@@ -1323,6 +1294,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_mov_b32 s7, 0xffff
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1335,10 +1307,8 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_and_b32 s0, s0, s7
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s1, v0
 ; GFX10-DL-NEXT:    s_and_b32 s1, s2, s7
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i16> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
@@ -1420,6 +1390,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_mov_b32 s8, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
@@ -1428,15 +1399,13 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_and_b32 s3, s3, s8
 ; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX9-NODL-NEXT:    s_and_b32 s5, s5, s8
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: notudot2_v4i16_Middle:
@@ -1444,6 +1413,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_mov_b32 s8, 0xffff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
@@ -1452,15 +1422,13 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_and_b32 s3, s3, s8
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX9-DL-NEXT:    s_and_b32 s5, s5, s8
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
 ; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: notudot2_v4i16_Middle:
@@ -1469,6 +1437,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_mov_b32 s7, 0xffff
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1481,10 +1450,8 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s1, v0
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s2, 16
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                  <4 x i16> addrspace(1)* %src2,
                                                  i32 addrspace(1)* nocapture %dst) {
@@ -1566,6 +1533,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -1574,15 +1542,13 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
 ; GFX9-NODL-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s7, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: notudot2_DiffIndex:
@@ -1590,6 +1556,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -1598,15 +1565,13 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_and_b32 s6, s3, s2
 ; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
 ; GFX9-DL-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: notudot2_DiffIndex:
@@ -1614,6 +1579,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1627,10 +1593,8 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_and_b32 s0, s0, s2
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s3, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                               <2 x i16> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
@@ -1714,6 +1678,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -1723,15 +1688,13 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v1, v0
-; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v2, v1
+; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot2_MultipleUses_add1:
@@ -1739,6 +1702,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -1748,15 +1712,13 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v1, v0
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v2, v1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot2_MultipleUses_add1:
@@ -1764,6 +1726,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1778,10 +1741,8 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_and_b32 s0, s0, s6
 ; GFX10-DL-NEXT:    s_and_b32 s1, s1, s6
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s1, s0, v0
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
                                                    i32 addrspace(1)* nocapture %dst) {
@@ -1863,6 +1824,7 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -1872,21 +1834,20 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v0
-; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v2, v1
+; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot2_MultipleUses_add1:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -1896,15 +1857,13 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v0
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v2, v1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot2_MultipleUses_add1:
@@ -1912,6 +1871,7 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1925,10 +1885,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s1, s0, v0
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
                                                    i32 addrspace(1)* nocapture %dst) {
@@ -2014,6 +1972,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -2022,16 +1981,14 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s4, v2, v0
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v1, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot2_MultipleUses_mul1:
@@ -2039,6 +1996,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -2047,16 +2005,14 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_and_b32 s6, s3, s2
 ; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s4, v2, v0
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v1, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
@@ -2064,6 +2020,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -2078,10 +2035,8 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s1, s0, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
                                                    i32 addrspace(1)* nocapture %dst) {
@@ -2164,6 +2119,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -2172,22 +2128,21 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s3, v2, v0
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v3, v1
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot2_MultipleUses_mul1:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -2196,16 +2151,14 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s3, v2, v0
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v3, v1
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
@@ -2213,6 +2166,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -2226,10 +2180,8 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s1, s0, v0
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s3, s2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
                                                    i32 addrspace(1)* nocapture %dst) {
@@ -2316,6 +2268,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -2325,15 +2278,13 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, v1, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
@@ -2341,6 +2292,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -2350,15 +2302,13 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v1, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
@@ -2366,6 +2316,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -2380,10 +2331,8 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_and_b32 s0, s0, s6
 ; GFX10-DL-NEXT:    s_and_b32 s1, s1, s6
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
                                                    i32 addrspace(1)* nocapture %dst) {
@@ -2466,6 +2415,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -2475,21 +2425,20 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -2499,15 +2448,13 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
@@ -2515,6 +2462,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -2528,10 +2476,8 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
                                                    i32 addrspace(1)* nocapture %dst) {
@@ -2614,58 +2560,54 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_and_b32 s3, s2, s0
-; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NODL-NEXT:    s_and_b32 s0, s1, s0
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT:    s_and_b32 s5, s4, s2
+; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX9-NODL-NEXT:    s_and_b32 s2, s3, s2
+; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot2_acc16:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s2, v3, v2
-; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, s2, v2, v1
+; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot2_acc16:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s0, s1, v2
-; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, s0, s1, v1
+; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                        <2 x i16> addrspace(1)* %src2,
                                        i16 addrspace(1)* nocapture %dst) {
@@ -2751,86 +2693,71 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NODL-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX9-NODL-NEXT:    global_load_ushort v1, v[2:3], off
+; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_ushort v2, v0, s[6:7]
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT:    v_bfe_i32 v2, v0, 0, 8
-; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
 ; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
-; GFX9-NODL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
+; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
 ; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, v1, v0, s2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v2, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v2, v1, s2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v4, v3, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: notsdot2_sext8:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-DL-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX9-DL-NEXT:    global_load_ushort v1, v[2:3], off
+; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_bfe_i32 v2, v0, 0, 8
-; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 8
 ; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
-; GFX9-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 8
+; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
 ; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, v1, v0, s2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, v3, v2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v4, v3, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: notsdot2_sext8:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-DL-NEXT:    v_mov_b32_e32 v3, s7
-; GFX10-DL-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX10-DL-NEXT:    global_load_ushort v1, v[2:3], off
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, v0
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, v1
-; GFX10-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, v2
 ; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX10-DL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v4, v4, 0, 8
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, v3, v2, s2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, v1, v0, v2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_i32_i24 v3, v4, v3, s2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, v3
+; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                           <2 x i8> addrspace(1)* %src2,
                                           i32 addrspace(1)* nocapture %dst) {

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 629538ac1bc9..c94108055da7 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -73,6 +73,7 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -81,40 +82,37 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
 ; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NODL-NEXT:    s_bfe_i32 s9, s3, 0x80010
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
 ; GFX9-NODL-NEXT:    s_bfe_i32 s6, s2, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX9-NODL-NEXT:    s_bfe_i32 s8, s2, 0x80010
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 24
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_acc32:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, s4, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc32:
@@ -122,6 +120,7 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -129,10 +128,8 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    v_dot4_i32_i8 v2, s0, s1, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot4_i32_i8 v0, s0, s1, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
                                        i32 addrspace(1)* nocapture %dst) {
@@ -251,65 +248,61 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s2, s0
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s3, s1
-; GFX9-NODL-NEXT:    s_bfe_i32 s5, s1, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    s_bfe_i32 s7, s1, 0x80010
-; GFX9-NODL-NEXT:    s_bfe_i32 s4, s0, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-NODL-NEXT:    s_bfe_i32 s6, s0, 0x80010
-; GFX9-NODL-NEXT:    s_ashr_i32 s1, s1, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-NODL-NEXT:    s_ashr_i32 s0, s0, 24
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
+; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    s_bfe_i32 s9, s3, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_i32 s6, s2, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NODL-NEXT:    s_bfe_i32 s8, s2, 0x80010
+; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v2, v1
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v3, v1
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s8, v4, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_acc16:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s2, v3, v2
-; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, s2, v2, v1
+; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc16:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT:    v_dot4_i32_i8 v2, s0, s1, v2
-; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot4_i32_i8 v1, s0, s1, v1
+; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
                                        i16 addrspace(1)* nocapture %dst) {
@@ -419,66 +412,62 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s1, 0x80008
-; GFX9-NODL-NEXT:    s_and_b32 s3, s2, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s2, 0x80008
-; GFX9-NODL-NEXT:    s_and_b32 s0, s1, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    s_bfe_u32 s6, s2, 0x80010
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s1, 0x80010
-; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s5, s4, s2
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s4, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s2, s3, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    s_bfe_u32 s8, s4, 0x80010
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NODL-NEXT:    s_bfe_u32 s9, s3, 0x80010
+; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
+; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v3, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_acc8:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s2, v2, v1
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc8:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s0, s1, v2
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, s0, s1, v1
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                       <4 x i8> addrspace(1)* %src2,
                                       i8 addrspace(1)* nocapture %dst) {
@@ -580,6 +569,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -588,30 +578,29 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
 ; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NODL-NEXT:    s_bfe_i32 s6, s2, 0x80008
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s4, v1, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NODL-NEXT:    s_bfe_i32 s9, s3, 0x80010
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
 ; GFX9-NODL-NEXT:    s_bfe_i32 s8, s2, 0x80010
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 24
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_multiuse_mul1:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -620,24 +609,22 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_sext_i32_i8 s4, s2
 ; GFX9-DL-NEXT:    s_sext_i32_i8 s5, s3
 ; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x80008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-DL-NEXT:    s_bfe_i32 s6, s2, 0x80008
-; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v1, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-DL-NEXT:    s_bfe_i32 s9, s3, 0x80010
-; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
 ; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x80010
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 24
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_multiuse_mul1:
@@ -645,6 +632,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -664,10 +652,8 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 24
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s2, s3, v0
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s0, s1, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i8> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
@@ -779,64 +765,62 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s8, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
-; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
+; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
+; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
 ; GFX9-NODL-NEXT:    s_ashr_i32 s6, s3, 24
 ; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80010
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s3, s3
 ; GFX9-NODL-NEXT:    s_ashr_i32 s4, s2, 24
 ; GFX9-NODL-NEXT:    s_bfe_i32 s5, s2, 0x80010
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s2, s2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s8
-; GFX9-NODL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s2, v2, v3
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, v0, v1, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s4, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, s2, v3, v4
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v1, v2, v3
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s5, v2, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_acc32_vecMul:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s8, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
-; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
+; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
+; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
 ; GFX9-DL-NEXT:    s_ashr_i32 s6, s3, 24
 ; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x80010
 ; GFX9-DL-NEXT:    s_sext_i32_i8 s3, s3
 ; GFX9-DL-NEXT:    s_ashr_i32 s4, s2, 24
 ; GFX9-DL-NEXT:    s_bfe_i32 s5, s2, 0x80010
 ; GFX9-DL-NEXT:    s_sext_i32_i8 s2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s8
-; GFX9-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s2, v2, v3
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, v0, v1, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX9-DL-NEXT:    v_mad_i32_i24 v3, s2, v3, v4
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v1, v2, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s5, v2, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc32_vecMul:
@@ -863,11 +847,10 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_ashr_i32 s2, s2, 24
 ; GFX10-DL-NEXT:    s_ashr_i32 s3, s3, 24
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, v1, v2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s4, s5, v0
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s2, s3, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
@@ -962,122 +945,118 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX9-NODL-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX9-NODL-NEXT:    v_ashrrev_i16_e64 v3, 8, s5
+; GFX9-NODL-NEXT:    v_ashrrev_i16_e64 v4, 8, s5
 ; GFX9-NODL-NEXT:    s_bfe_i32 s5, s5, 0x80000
-; GFX9-NODL-NEXT:    v_ashrrev_i16_e64 v2, 8, s4
-; GFX9-NODL-NEXT:    v_and_b32_e32 v5, s5, v4
+; GFX9-NODL-NEXT:    v_ashrrev_i16_e64 v3, 8, s4
+; GFX9-NODL-NEXT:    v_and_b32_e32 v6, s5, v5
 ; GFX9-NODL-NEXT:    s_bfe_i32 s4, s4, 0x80000
-; GFX9-NODL-NEXT:    v_lshl_or_b32 v3, v3, 16, v5
-; GFX9-NODL-NEXT:    v_and_b32_e32 v5, s4, v4
-; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
-; GFX9-NODL-NEXT:    v_ashrrev_i16_e64 v1, 8, s3
+; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, v4, 16, v6
+; GFX9-NODL-NEXT:    v_and_b32_e32 v6, s4, v5
+; GFX9-NODL-NEXT:    v_lshl_or_b32 v3, v3, 16, v6
+; GFX9-NODL-NEXT:    v_ashrrev_i16_e64 v2, 8, s3
 ; GFX9-NODL-NEXT:    s_bfe_i32 s3, s3, 0x80000
-; GFX9-NODL-NEXT:    v_ashrrev_i16_e64 v0, 8, s2
-; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v2, v2, v3
-; GFX9-NODL-NEXT:    v_and_b32_e32 v3, s3, v4
+; GFX9-NODL-NEXT:    v_ashrrev_i16_e64 v1, 8, s2
+; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, s3, v5
 ; GFX9-NODL-NEXT:    s_bfe_i32 s2, s2, 0x80000
-; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
-; GFX9-NODL-NEXT:    v_and_b32_e32 v3, s2, v4
-; GFX9-NODL-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
-; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v3, v0, v1
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
+; GFX9-NODL-NEXT:    v_and_b32_e32 v4, s2, v5
+; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
+; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
+; GFX9-NODL-NEXT:    global_load_ushort v2, v0, s[0:1]
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_add_u32_e32 v4, v3, v4
-; GFX9-NODL-NEXT:    v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NODL-NEXT:    v_add_u32_e32 v3, v3, v2
-; GFX9-NODL-NEXT:    v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v1, v2
+; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_acc16_vecMul:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, 0xffff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX9-DL-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX9-DL-NEXT:    v_ashrrev_i16_e64 v3, 8, s5
+; GFX9-DL-NEXT:    v_ashrrev_i16_e64 v4, 8, s5
 ; GFX9-DL-NEXT:    s_bfe_i32 s5, s5, 0x80000
-; GFX9-DL-NEXT:    v_ashrrev_i16_e64 v2, 8, s4
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, s5, v4
+; GFX9-DL-NEXT:    v_ashrrev_i16_e64 v3, 8, s4
+; GFX9-DL-NEXT:    v_and_b32_e32 v6, s5, v5
 ; GFX9-DL-NEXT:    s_bfe_i32 s4, s4, 0x80000
-; GFX9-DL-NEXT:    v_lshl_or_b32 v3, v3, 16, v5
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, s4, v4
-; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
-; GFX9-DL-NEXT:    v_ashrrev_i16_e64 v1, 8, s3
+; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v6
+; GFX9-DL-NEXT:    v_and_b32_e32 v6, s4, v5
+; GFX9-DL-NEXT:    v_lshl_or_b32 v3, v3, 16, v6
+; GFX9-DL-NEXT:    v_ashrrev_i16_e64 v2, 8, s3
 ; GFX9-DL-NEXT:    s_bfe_i32 s3, s3, 0x80000
-; GFX9-DL-NEXT:    v_ashrrev_i16_e64 v0, 8, s2
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v3
-; GFX9-DL-NEXT:    v_and_b32_e32 v3, s3, v4
+; GFX9-DL-NEXT:    v_ashrrev_i16_e64 v1, 8, s2
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
+; GFX9-DL-NEXT:    v_and_b32_e32 v4, s3, v5
 ; GFX9-DL-NEXT:    s_bfe_i32 s2, s2, 0x80000
-; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
-; GFX9-DL-NEXT:    v_and_b32_e32 v3, s2, v4
-; GFX9-DL-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
+; GFX9-DL-NEXT:    v_and_b32_e32 v4, s2, v5
+; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
+; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[0:1]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u32_e32 v4, v3, v4
-; GFX9-DL-NEXT:    v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_e32 v3, v3, v2
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, v1, v2
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc16_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0xffff
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 8, s0
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 8, s0
 ; GFX10-DL-NEXT:    s_bfe_i32 s0, s0, 0x80000
 ; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x80000
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, s0, v3
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 8, s1
-; GFX10-DL-NEXT:    v_and_b32_e32 v6, s3, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, s0, v2
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 8, s1
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, s3, v2
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s1, 16
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v8, 8, s2
-; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v7
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 8, s2
+; GFX10-DL-NEXT:    v_lshl_or_b32 v3, v3, 16, v6
 ; GFX10-DL-NEXT:    s_bfe_i32 s1, s2, 0x80000
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x80000
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 8, s0
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, s2, v3
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s1, v3
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v6, 16, v7
-; GFX10-DL-NEXT:    v_lshl_or_b32 v3, v8, 16, v3
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v5
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 8, s0
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, s2, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, s1, v2
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v5, 16, v6
+; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v4, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v3, v1
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
                                               i16 addrspace(1)* nocapture %dst) {

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index aa4dc4e143d9..b491db021d1f 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -76,6 +76,7 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -84,40 +85,37 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
 ; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc32:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s4, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc32:
@@ -125,6 +123,7 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -132,10 +131,8 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s0, s1, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, s0, s1, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
                                        i32 addrspace(1)* nocapture %dst) {
@@ -246,66 +243,62 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-NODL-NEXT:    s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
+; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, v4, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v3, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s8, v4, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc16:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
-; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s2, v2, v1
+; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc16:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s0, s1, v2
-; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, s0, s1, v1
+; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
                                        i16 addrspace(1)* nocapture %dst) {
@@ -416,66 +409,62 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s1, 0x80008
-; GFX9-NODL-NEXT:    s_and_b32 s3, s2, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s2, 0x80008
-; GFX9-NODL-NEXT:    s_and_b32 s0, s1, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    s_bfe_u32 s6, s2, 0x80010
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s1, 0x80010
-; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s5, s4, s2
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s4, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s2, s3, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    s_bfe_u32 s8, s4, 0x80010
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NODL-NEXT:    s_bfe_u32 s9, s3, 0x80010
+; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
+; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v3, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc8:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s2, v2, v1
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc8:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s0, s1, v2
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, s0, s1, v1
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                       <4 x i8> addrspace(1)* %src2,
                                       i8 addrspace(1)* nocapture %dst) {
@@ -563,60 +552,56 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_and_b32 s3, s2, s0
-; GFX9-NODL-NEXT:    s_and_b32 s0, s1, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    s_bfe_u32 s2, s2, 0x80008
-; GFX9-NODL-NEXT:    s_bfe_u32 s1, s1, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s5, s4, s2
+; GFX9-NODL-NEXT:    s_and_b32 s2, s3, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    s_bfe_u32 s4, s4, 0x80008
+; GFX9-NODL-NEXT:    s_bfe_u32 s3, s3, 0x80008
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot2_8:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s3, s2, s0
-; GFX9-DL-NEXT:    s_and_b32 s0, s1, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x80008
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x80008
+; GFX9-DL-NEXT:    s_and_b32 s5, s4, s2
+; GFX9-DL-NEXT:    s_and_b32 s2, s3, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s4, 0x80008
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s3, 0x80008
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot2_8:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_movk_i32 s1, 0xff
@@ -625,10 +610,10 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_and_b32 s1, s0, s1
 ; GFX10-DL-NEXT:    s_bfe_u32 s0, s0, 0x80008
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s1, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s1, s2, 0x80008
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                    <4 x i8> addrspace(1)* %src2,
                                    i8 addrspace(1)* nocapture %dst) {
@@ -720,66 +705,62 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
+; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
+; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v3, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_CommutationInsideMAD:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s3, v3, v2
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s3, v2, v1
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s1, s0, v2
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, s1, s0, v1
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                       <4 x i8> addrspace(1)* %src2,
                                                       i8 addrspace(1)* nocapture %dst) {
@@ -883,94 +864,90 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NODL-NEXT:    s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
+; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
+; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v4, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v3, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX9-DL-NEXT:    s_and_b32 s3, s1, s0
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x80008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s1, 0x80010
-; GFX9-DL-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX9-DL-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX9-DL-NEXT:    s_and_b32 s5, s3, s2
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x80008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s4, 0x80010
+; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 24
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s8
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 24
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v3, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_movk_i32 s6, 0xff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80008
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80008
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
-; GFX10-DL-NEXT:    s_and_b32 s2, s0, s4
-; GFX10-DL-NEXT:    s_and_b32 s3, s1, s4
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s3, s2, v1
+; GFX10-DL-NEXT:    s_and_b32 s2, s0, s6
+; GFX10-DL-NEXT:    s_and_b32 s3, s1, s6
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s3, s2, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80010
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80010
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s3, s2, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s1, s0, v1
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                         <4 x i8> addrspace(1)* %src2,
                                                         i8 addrspace(1)* nocapture %dst) {
@@ -1076,6 +1053,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -1084,24 +1062,22 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v0, v1
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v1, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
 ; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_multiuse_mul1:
@@ -1109,6 +1085,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -1117,24 +1094,22 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_and_b32 s5, s3, s2
 ; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x80008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-DL-NEXT:    s_bfe_u32 s6, s3, 0x80008
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v1, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-DL-NEXT:    s_bfe_u32 s9, s4, 0x80010
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
 ; GFX9-DL-NEXT:    s_bfe_u32 s8, s3, 0x80010
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
 ; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 24
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_multiuse_mul1:
@@ -1142,6 +1117,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1162,10 +1138,8 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s0, s1, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i8> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
@@ -1282,6 +1256,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -1291,24 +1266,22 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v1, v2
 ; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
-; GFX9-NODL-NEXT:    v_add_u32_e32 v1, s10, v0
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s5, v2, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NODL-NEXT:    v_add_u32_e32 v2, s10, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v3, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s8, v2, v0
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s8, v3, v1
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s3, v2, v0
-; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v0, v1
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v3, v1
+; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_multiuse_add1:
@@ -1316,6 +1289,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -1325,24 +1299,22 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x80008
 ; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-DL-NEXT:    s_bfe_u32 s6, s3, 0x80008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v1, v2
 ; GFX9-DL-NEXT:    s_bfe_u32 s9, s4, 0x80010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    s_bfe_u32 s8, s3, 0x80010
-; GFX9-DL-NEXT:    v_add_u32_e32 v1, s10, v0
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s5, v2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, s10, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v3, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s8, v2, v0
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v3, v1
 ; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 24
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s3, v2, v0
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s3, v3, v1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_multiuse_add1:
@@ -1351,6 +1323,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_movk_i32 s7, 0xff
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1371,10 +1344,8 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, s6, v0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, v1, v0
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i8> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
@@ -1487,91 +1458,87 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s0, 0x80008
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s1, 0x80008
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s3, s1
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s2, 0x80008
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80008
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NODL-NEXT:    s_bfe_u32 s9, s3, 0x80010
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s1, 0x80010
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s2, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-NODL-NEXT:    s_bfe_u32 s6, s0, 0x80010
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-NODL-NEXT:    s_lshr_b32 s0, s0, 24
+; GFX9-NODL-NEXT:    s_bfe_u32 s8, s2, 0x80010
+; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s2, v4, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v3, v1
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s8, v4, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: notdot4_mixedtypes:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x80008
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s1, 0x80008
-; GFX9-DL-NEXT:    s_sext_i32_i8 s3, s1
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x80008
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s3, 0x80008
+; GFX9-DL-NEXT:    s_sext_i32_i8 s5, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s3, 0x80010
+; GFX9-DL-NEXT:    s_sext_i32_i8 s4, s2
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s1, 0x80010
-; GFX9-DL-NEXT:    s_sext_i32_i8 s2, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x80010
-; GFX9-DL-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 24
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x80010
+; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 24
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s2, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v3, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v4, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: notdot4_mixedtypes:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80008
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80008
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_sext_i32_i8 s2, s0
 ; GFX10-DL-NEXT:    s_sext_i32_i8 s3, s1
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80010
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80010
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
                                               i16 addrspace(1)* nocapture %dst) {
@@ -1685,6 +1652,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -1692,24 +1660,22 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_lshr_b32 s5, s3, 24
 ; GFX9-NODL-NEXT:    s_lshr_b32 s6, s4, 24
 ; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80010
-; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v0, 8, s3
+; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
 ; GFX9-NODL-NEXT:    s_and_b32 s3, s3, s2
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-NODL-NEXT:    s_bfe_u32 s8, s4, 0x80010
-; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v1, 8, s4
+; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v2, 8, s4
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v2, v3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, s3, v3, v4
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v1, v2, v3
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc32_vecMul:
@@ -1717,6 +1683,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
@@ -1724,24 +1691,22 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_lshr_b32 s5, s3, 24
 ; GFX9-DL-NEXT:    s_lshr_b32 s6, s4, 24
 ; GFX9-DL-NEXT:    s_bfe_u32 s7, s3, 0x80010
-; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v0, 8, s3
+; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
 ; GFX9-DL-NEXT:    s_and_b32 s3, s3, s2
 ; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
 ; GFX9-DL-NEXT:    s_bfe_u32 s8, s4, 0x80010
-; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s4
+; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s4
 ; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v2, v3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-DL-NEXT:    v_mad_u32_u24 v3, s3, v3, v4
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v1, v2, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc32_vecMul:
@@ -1770,11 +1735,10 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s4, s5, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
@@ -1870,7 +1834,8 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -1878,35 +1843,34 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-NODL-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s2, 24
-; GFX9-NODL-NEXT:    v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NODL-NEXT:    s_lshr_b32 s6, s3, 24
-; GFX9-NODL-NEXT:    v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NODL-NEXT:    v_lshl_or_b32 v3, s6, 16, v3
-; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, s4, 16, v4
-; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v3, v4, v3
-; GFX9-NODL-NEXT:    v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, s6, 16, v4
+; GFX9-NODL-NEXT:    v_lshl_or_b32 v5, s4, 16, v5
+; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
+; GFX9-NODL-NEXT:    v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
 ; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
-; GFX9-NODL-NEXT:    v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
-; GFX9-NODL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v2, v0, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX9-NODL-NEXT:    v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
+; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
+; GFX9-NODL-NEXT:    global_load_ushort v2, v0, s[0:1]
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_add_u32_e32 v4, v2, v4
-; GFX9-NODL-NEXT:    v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NODL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v1, v2
+; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v4
+; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc16_vecMul:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -1914,66 +1878,62 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX9-DL-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 24
-; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-DL-NEXT:    s_lshr_b32 s6, s3, 24
-; GFX9-DL-NEXT:    v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_lshl_or_b32 v3, s6, 16, v3
-; GFX9-DL-NEXT:    v_lshl_or_b32 v4, s4, 16, v4
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, v4, v3
-; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_lshl_or_b32 v4, s6, 16, v4
+; GFX9-DL-NEXT:    v_lshl_or_b32 v5, s4, 16, v5
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
+; GFX9-DL-NEXT:    v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
 ; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
-; GFX9-DL-NEXT:    v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
-; GFX9-DL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v0, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX9-DL-NEXT:    v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
+; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
+; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[0:1]
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u32_e32 v4, v2, v4
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, v1, v2
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v4
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc16_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0xffff
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s0
-; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v5, 8, s1
-; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s1
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s1, 16
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v7
+; GFX10-DL-NEXT:    v_lshl_or_b32 v3, v3, 16, v6
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
-; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_and_b32_sdwa v3, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, s1, 16, v6
-; GFX10-DL-NEXT:    v_lshl_or_b32 v3, s0, 16, v3
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v5
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, s1, 16, v5
+; GFX10-DL-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v4, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v3, v1
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
                                               i16 addrspace(1)* nocapture %dst) {
@@ -2091,110 +2051,106 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NODL-NEXT:    global_load_ubyte v4, v0, s[0:1]
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v0, s2, v0
-; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-NODL-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX9-NODL-NEXT:    s_lshr_b32 s7, s3, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v1, s2, v1
+; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX9-NODL-NEXT:    s_lshr_b32 s5, s2, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v1, s4, v1
-; GFX9-NODL-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_or_b32_e32 v3, v2, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ubyte v5, v[0:1], off
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v3, s4, v3
+; GFX9-NODL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NODL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NODL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NODL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v4
+; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc8_vecMul:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    global_load_ubyte v4, v0, s[0:1]
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v0, s2, v0
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-DL-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX9-DL-NEXT:    s_lshr_b32 s7, s3, 24
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v1, s2, v1
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX9-DL-NEXT:    s_lshr_b32 s5, s2, 24
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v1, s4, v1
-; GFX9-DL-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_or_b32_e32 v3, v2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v5, v[0:1], off
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, s4, v3
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-DL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v4
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc8_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s0
-; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s1
+; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s0
+; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s1
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 24
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 24
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s2, s3
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, v3, v4
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s0, s1
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s2, s3
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v2, v2, v3
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, s0, s1
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v3
-; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 8, v5
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s0, s1
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_or_b32_e32 v4, v3, v4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 8, v2
+; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v4
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s0, s1
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v5
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v4
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                              <4 x i8> addrspace(1)* %src2,
                                              i8 addrspace(1)* nocapture %dst) {

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index f6252c10c05a..2e3411fe140a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -129,43 +129,42 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX9-NEXT:    s_addc_u32 s21, s21, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_i32 s4, s2, 0x40000
 ; GFX9-NEXT:    s_bfe_i32 s5, s3, 0x40000
 ; GFX9-NEXT:    s_bfe_i32 s7, s3, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
 ; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX9-NEXT:    s_bfe_i32 s9, s3, 0x40008
-; GFX9-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
+; GFX9-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
 ; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-NEXT:    s_bfe_i32 s11, s3, 0x4000c
-; GFX9-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
+; GFX9-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
 ; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX9-NEXT:    s_bfe_i32 s13, s3, 0x40010
-; GFX9-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
+; GFX9-NEXT:    v_mad_i32_i24 v1, s10, v2, v1
 ; GFX9-NEXT:    s_bfe_i32 s12, s2, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s13
 ; GFX9-NEXT:    s_bfe_i32 s15, s3, 0x40014
 ; GFX9-NEXT:    s_bfe_i32 s17, s3, 0x40018
-; GFX9-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
+; GFX9-NEXT:    v_mad_i32_i24 v1, s12, v2, v1
 ; GFX9-NEXT:    s_bfe_i32 s14, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v1, s15
+; GFX9-NEXT:    v_mov_b32_e32 v2, s15
 ; GFX9-NEXT:    s_bfe_i32 s16, s2, 0x40018
-; GFX9-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mad_i32_i24 v1, s14, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s17
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 28
-; GFX9-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
+; GFX9-NEXT:    v_mad_i32_i24 v1, s16, v2, v1
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot8_acc32:
@@ -181,14 +180,13 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_dot8_i32_i4 v2, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, s4, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot8_acc32:
@@ -201,6 +199,7 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -209,10 +208,8 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    v_dot8_i32_i4 v2, s0, s1, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot8_i32_i4 v0, s0, s1, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
                                        i32 addrspace(1)* nocapture %dst) {
@@ -416,169 +413,165 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s18, -1
-; GFX9-NEXT:    s_mov_b32 s19, 0xe00000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s22, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 s16, s16, s3
-; GFX9-NEXT:    s_addc_u32 s17, s17, 0
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
+; GFX9-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_i32 s4, s0, 0x40000
-; GFX9-NEXT:    s_bfe_i32 s5, s1, 0x40000
-; GFX9-NEXT:    s_bfe_i32 s7, s1, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s9, s1, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v6, s5
-; GFX9-NEXT:    s_lshr_b32 s2, s0, 12
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 12
-; GFX9-NEXT:    s_bfe_i32 s6, s0, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s8, s0, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mov_b32_e32 v7, s7
-; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s2
-; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s3
-; GFX9-NEXT:    v_mul_i32_i24_e32 v3, s8, v3
-; GFX9-NEXT:    s_bfe_i32 s11, s1, 0x40010
+; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s7, s3, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s9, s3, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s11, s3, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v5, s7
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 12
+; GFX9-NEXT:    s_lshr_b32 s5, s3, 12
+; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-NEXT:    v_mov_b32_e32 v6, s9
+; GFX9-NEXT:    v_lshlrev_b16_e64 v3, 12, s4
+; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s5
+; GFX9-NEXT:    v_mul_i32_i24_e32 v2, s10, v2
+; GFX9-NEXT:    s_bfe_i32 s13, s3, 0x40010
+; GFX9-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-NEXT:    s_bfe_i32 s13, s1, 0x40014
-; GFX9-NEXT:    s_bfe_i32 s10, s0, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-NEXT:    s_bfe_i32 s15, s1, 0x40018
-; GFX9-NEXT:    s_bfe_i32 s12, s0, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-NEXT:    s_bfe_i32 s14, s0, 0x40018
-; GFX9-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX9-NEXT:    v_mov_b32_e32 v10, s15
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 28
+; GFX9-NEXT:    s_bfe_i32 s15, s3, 0x40014
+; GFX9-NEXT:    s_bfe_i32 s12, s2, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    s_bfe_i32 s17, s3, 0x40018
+; GFX9-NEXT:    s_bfe_i32 s14, s2, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v8, s15
+; GFX9-NEXT:    s_bfe_i32 s16, s2, 0x40018
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX9-NEXT:    v_mov_b32_e32 v9, s17
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_i32_i24 v2, s4, v6, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s6, v7, v2
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s10, v8, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s12, v9, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s14, v10, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    v_mad_i32_i24 v1, s6, v5, v1
+; GFX9-NEXT:    v_mad_i32_i24 v1, s8, v6, v1
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
+; GFX9-NEXT:    v_mad_i32_i24 v1, s12, v7, v1
+; GFX9-NEXT:    v_mad_i32_i24 v1, s14, v8, v1
+; GFX9-NEXT:    v_mad_i32_i24 v1, s16, v9, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot8_acc16:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT:    s_mov_b32 s18, -1
-; GFX9-DL-NEXT:    s_mov_b32 s19, 0xe00000
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT:    s_mov_b32 s22, -1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_add_u32 s16, s16, s3
-; GFX9-DL-NEXT:    s_addc_u32 s17, s17, 0
+; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_i32 s4, s0, 0x40000
-; GFX9-DL-NEXT:    s_bfe_i32 s5, s1, 0x40000
-; GFX9-DL-NEXT:    s_bfe_i32 s7, s1, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s9, s1, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s5
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 12
-; GFX9-DL-NEXT:    s_lshr_b32 s3, s1, 12
-; GFX9-DL-NEXT:    s_bfe_i32 s6, s0, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s8, s0, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s7
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s2
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s3
-; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v3, s8, v3
-; GFX9-DL-NEXT:    s_bfe_i32 s11, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_i32 s6, s2, 0x40000
+; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x40000
+; GFX9-DL-NEXT:    s_bfe_i32 s9, s3, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s11, s3, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s7
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 12
+; GFX9-DL-NEXT:    s_lshr_b32 s5, s3, 12
+; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s10, s2, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s9
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s4
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s5
+; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v2, s10, v2
+; GFX9-DL-NEXT:    s_bfe_i32 s13, s3, 0x40010
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-DL-NEXT:    s_bfe_i32 s13, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_i32 s10, s0, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT:    s_bfe_i32 s15, s1, 0x40018
-; GFX9-DL-NEXT:    s_bfe_i32 s12, s0, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-DL-NEXT:    s_bfe_i32 s14, s0, 0x40018
-; GFX9-DL-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s15
-; GFX9-DL-NEXT:    s_ashr_i32 s0, s0, 28
+; GFX9-DL-NEXT:    s_bfe_i32 s15, s3, 0x40014
+; GFX9-DL-NEXT:    s_bfe_i32 s12, s2, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT:    s_bfe_i32 s17, s3, 0x40018
+; GFX9-DL-NEXT:    s_bfe_i32 s14, s2, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s15
+; GFX9-DL-NEXT:    s_bfe_i32 s16, s2, 0x40018
+; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s17
+; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v6, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v7, v2
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s10, v8, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s12, v9, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s14, v10, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v5, v1
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s8, v6, v1
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s12, v7, v1
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s14, v8, v1
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s16, v9, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot8_acc16:
 ; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s14, -1
 ; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 12
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 12
-; GFX10-DL-NEXT:    s_bfe_i32 s4, s0, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s1, 0x40000
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s2
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s3
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40004
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s0, 0x40008
-; GFX10-DL-NEXT:    s_bfe_i32 s8, s1, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s2
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s3
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s10, s1, 0x40008
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s1, 0x40004
-; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v5, s7, s8
+; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v4, s9, s10
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v2, 12, v2
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
 ; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40010
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s6, s2, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s8, s2, v1
 ; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, s2, v4
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, v3, v4, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, v2, v3, v1
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40014
 ; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
 ; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s0, s1, v1
+; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
                                        i16 addrspace(1)* nocapture %dst) {
@@ -784,175 +777,171 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s22, -1
-; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-NEXT:    s_addc_u32 s21, s21, 0
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 12
-; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x40000
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 12
-; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x40008
-; GFX9-NEXT:    s_bfe_i32 s5, s1, 0x40000
-; GFX9-NEXT:    v_mov_b32_e32 v6, s6
-; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s3
-; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
-; GFX9-NEXT:    s_bfe_i32 s7, s1, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s9, s1, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v3, s10
-; GFX9-NEXT:    v_mov_b32_e32 v7, s8
+; GFX9-NEXT:    s_lshr_b32 s5, s3, 12
+; GFX9-NEXT:    s_bfe_i32 s8, s4, 0x40000
+; GFX9-NEXT:    s_lshr_b32 s6, s4, 12
+; GFX9-NEXT:    s_bfe_i32 s10, s4, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s12, s4, 0x40008
+; GFX9-NEXT:    s_bfe_i32 s7, s3, 0x40000
+; GFX9-NEXT:    v_mov_b32_e32 v5, s8
+; GFX9-NEXT:    v_lshlrev_b16_e64 v3, 12, s5
+; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s6
+; GFX9-NEXT:    s_bfe_i32 s9, s3, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s11, s3, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v2, s12
+; GFX9-NEXT:    v_mov_b32_e32 v6, s10
+; GFX9-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
-; GFX9-NEXT:    s_bfe_i32 s12, s2, 0x40010
-; GFX9-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, s0, v5
-; GFX9-NEXT:    s_bfe_i32 s14, s2, 0x40014
-; GFX9-NEXT:    s_bfe_i32 s11, s1, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    s_bfe_i32 s16, s2, 0x40018
-; GFX9-NEXT:    s_bfe_i32 s13, s1, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v9, s14
-; GFX9-NEXT:    s_bfe_i32 s15, s1, 0x40018
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-NEXT:    v_mov_b32_e32 v10, s16
-; GFX9-NEXT:    s_ashr_i32 s1, s1, 28
+; GFX9-NEXT:    v_mul_i32_i24_e32 v2, s11, v2
+; GFX9-NEXT:    s_bfe_i32 s14, s4, 0x40010
+; GFX9-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX9-NEXT:    v_and_b32_e32 v4, s2, v4
+; GFX9-NEXT:    s_bfe_i32 s16, s4, 0x40014
+; GFX9-NEXT:    s_bfe_i32 s13, s3, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v7, s14
+; GFX9-NEXT:    s_bfe_i32 s18, s4, 0x40018
+; GFX9-NEXT:    s_bfe_i32 s15, s3, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v8, s16
+; GFX9-NEXT:    s_bfe_i32 s17, s3, 0x40018
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 28
+; GFX9-NEXT:    v_mov_b32_e32 v9, s18
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
-; GFX9-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NEXT:    v_mad_i32_i24 v1, s7, v5, v1
+; GFX9-NEXT:    v_mad_i32_i24 v1, s9, v6, v1
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
+; GFX9-NEXT:    v_mad_i32_i24 v1, s13, v7, v1
+; GFX9-NEXT:    v_mad_i32_i24 v1, s15, v8, v1
+; GFX9-NEXT:    v_mad_i32_i24 v1, s17, v9, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
+; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot8_acc8:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_mov_b32 s22, -1
-; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
-; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_lshr_b32 s3, s1, 12
-; GFX9-DL-NEXT:    s_bfe_i32 s6, s2, 0x40000
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 12
-; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s10, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_i32 s5, s1, 0x40000
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s6
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s3
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
-; GFX9-DL-NEXT:    s_bfe_i32 s7, s1, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s9, s1, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s8
+; GFX9-DL-NEXT:    s_lshr_b32 s5, s3, 12
+; GFX9-DL-NEXT:    s_bfe_i32 s8, s4, 0x40000
+; GFX9-DL-NEXT:    s_lshr_b32 s6, s4, 12
+; GFX9-DL-NEXT:    s_bfe_i32 s10, s4, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s12, s4, 0x40008
+; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x40000
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s8
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s5
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s6
+; GFX9-DL-NEXT:    s_bfe_i32 s9, s3, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s11, s3, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s12
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s10
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
-; GFX9-DL-NEXT:    s_bfe_i32 s12, s2, 0x40010
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, s0, v5
-; GFX9-DL-NEXT:    s_bfe_i32 s14, s2, 0x40014
-; GFX9-DL-NEXT:    s_bfe_i32 s11, s1, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-DL-NEXT:    s_bfe_i32 s16, s2, 0x40018
-; GFX9-DL-NEXT:    s_bfe_i32 s13, s1, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s14
-; GFX9-DL-NEXT:    s_bfe_i32 s15, s1, 0x40018
-; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s16
-; GFX9-DL-NEXT:    s_ashr_i32 s1, s1, 28
+; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v2, s11, v2
+; GFX9-DL-NEXT:    s_bfe_i32 s14, s4, 0x40010
+; GFX9-DL-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX9-DL-NEXT:    v_and_b32_e32 v4, s2, v4
+; GFX9-DL-NEXT:    s_bfe_i32 s16, s4, 0x40014
+; GFX9-DL-NEXT:    s_bfe_i32 s13, s3, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s14
+; GFX9-DL-NEXT:    s_bfe_i32 s18, s4, 0x40018
+; GFX9-DL-NEXT:    s_bfe_i32 s15, s3, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s16
+; GFX9-DL-NEXT:    s_bfe_i32 s17, s3, 0x40018
+; GFX9-DL-NEXT:    s_ashr_i32 s4, s4, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s18
+; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s7, v5, v1
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s9, v6, v1
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s13, v7, v1
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s15, v8, v1
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s17, v9, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s3, v2, v1
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot8_acc8:
 ; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s14, -1
 ; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 12
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 12
-; GFX10-DL-NEXT:    s_bfe_i32 s4, s0, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s1, 0x40000
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s2
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s3
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40004
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s0, 0x40008
-; GFX10-DL-NEXT:    s_bfe_i32 s8, s1, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s2
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s3
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s10, s1, 0x40008
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s1, 0x40004
-; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v5, s7, s8
+; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v4, s9, s10
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v2, 12, v2
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
 ; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40010
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s6, s2, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s8, s2, v1
 ; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, s2, v4
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, v3, v4, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, v2, v3, v1
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40014
 ; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
 ; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s0, s1, v1
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
                                        i8 addrspace(1)* nocapture %dst) {
@@ -1149,45 +1138,44 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX9-NEXT:    s_addc_u32 s21, s21, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_i32 s4, s2, 0x40000
 ; GFX9-NEXT:    s_bfe_i32 s5, s3, 0x40000
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mad_i32_i24 v2, s4, v1, v2
 ; GFX9-NEXT:    s_bfe_i32 s7, s3, 0x40004
 ; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x40004
 ; GFX9-NEXT:    s_bfe_i32 s9, s3, 0x40008
-; GFX9-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    v_mad_i32_i24 v0, s6, v2, v0
+; GFX9-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_mad_i32_i24 v1, s6, v3, v1
 ; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    s_bfe_i32 s11, s3, 0x4000c
-; GFX9-NEXT:    v_mad_i32_i24 v0, s8, v2, v0
+; GFX9-NEXT:    v_mad_i32_i24 v1, s8, v3, v1
 ; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX9-NEXT:    s_bfe_i32 s13, s3, 0x40010
-; GFX9-NEXT:    v_mad_i32_i24 v0, s10, v2, v0
+; GFX9-NEXT:    v_mad_i32_i24 v1, s10, v3, v1
 ; GFX9-NEXT:    s_bfe_i32 s12, s2, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v2, s13
+; GFX9-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX9-NEXT:    s_bfe_i32 s15, s3, 0x40014
 ; GFX9-NEXT:    s_bfe_i32 s17, s3, 0x40018
-; GFX9-NEXT:    v_mad_i32_i24 v0, s12, v2, v0
+; GFX9-NEXT:    v_mad_i32_i24 v1, s12, v3, v1
 ; GFX9-NEXT:    s_bfe_i32 s14, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v2, s15
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
 ; GFX9-NEXT:    s_bfe_i32 s16, s2, 0x40018
-; GFX9-NEXT:    v_mad_i32_i24 v0, s14, v2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-NEXT:    v_mad_i32_i24 v1, s14, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s17
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 28
-; GFX9-NEXT:    v_mad_i32_i24 v0, s16, v2, v0
+; GFX9-NEXT:    v_mad_i32_i24 v1, s16, v3, v1
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    v_mad_i32_i24 v0, s2, v2, v0
-; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_mad_i32_i24 v1, s2, v3, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot8_multiuses_mul1:
@@ -1204,45 +1192,44 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_bfe_i32 s4, s2, 0x40000
 ; GFX9-DL-NEXT:    s_bfe_i32 s5, s3, 0x40000
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v1, v2
 ; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x40004
 ; GFX9-DL-NEXT:    s_bfe_i32 s6, s2, 0x40004
 ; GFX9-DL-NEXT:    s_bfe_i32 s9, s3, 0x40008
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s6, v2, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v3, v1
 ; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-DL-NEXT:    s_bfe_i32 s11, s3, 0x4000c
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s8, v2, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s8, v3, v1
 ; GFX9-DL-NEXT:    s_bfe_i32 s10, s2, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX9-DL-NEXT:    s_bfe_i32 s13, s3, 0x40010
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s10, v2, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s10, v3, v1
 ; GFX9-DL-NEXT:    s_bfe_i32 s12, s2, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX9-DL-NEXT:    s_bfe_i32 s15, s3, 0x40014
 ; GFX9-DL-NEXT:    s_bfe_i32 s17, s3, 0x40018
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s12, v2, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s12, v3, v1
 ; GFX9-DL-NEXT:    s_bfe_i32 s14, s2, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s15
 ; GFX9-DL-NEXT:    s_bfe_i32 s16, s2, 0x40018
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s14, v2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s14, v3, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
 ; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 28
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s16, v2, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s16, v3, v1
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s2, v2, v0
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s2, v3, v1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot8_multiuses_mul1:
@@ -1256,6 +1243,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1288,10 +1276,8 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s0, s1, v1
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v0, v1
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                 <8 x i4> addrspace(1)* %src2,
                                                 i32 addrspace(1)* nocapture %dst) {
@@ -1485,6 +1471,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX9-NEXT:    s_addc_u32 s21, s21, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s4, s2, 28
 ; GFX9-NEXT:    s_ashr_i32 s11, s3, 28
@@ -1502,26 +1489,24 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_bfe_i32 s9, s2, 0x40008
 ; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x40004
 ; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x40000
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-NEXT:    v_mad_i32_i24 v0, s2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s17
-; GFX9-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s16
-; GFX9-NEXT:    v_mad_i32_i24 v0, s9, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s15
-; GFX9-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s14
-; GFX9-NEXT:    v_mad_i32_i24 v0, s7, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s13
-; GFX9-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s12
-; GFX9-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s11
-; GFX9-NEXT:    v_mad_i32_i24 v2, s4, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mad_i32_i24 v1, s2, v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-NEXT:    v_mad_i32_i24 v1, s10, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s16
+; GFX9-NEXT:    v_mad_i32_i24 v1, s9, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s15
+; GFX9-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mad_i32_i24 v1, s7, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s13
+; GFX9-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s12
+; GFX9-NEXT:    v_mad_i32_i24 v1, s5, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-NEXT:    v_mad_i32_i24 v1, s4, v2, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot8_acc32_vecMul:
@@ -1537,14 +1522,13 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_dot8_i32_i4 v2, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, s4, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot8_acc32_vecMul:
@@ -1557,6 +1541,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1565,10 +1550,8 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    v_dot8_i32_i4 v2, s0, s1, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot8_i32_i4 v0, s0, s1, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
@@ -1748,11 +1731,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_and_b32 s11, s2, 15
 ; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40004
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
-; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s9, s10
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s5, s8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s9, s10
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s5, s8
+; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x40018
 ; GFX9-NEXT:    s_lshr_b32 s12, s6, 28
@@ -1762,40 +1745,39 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
 ; GFX9-NEXT:    s_and_b32 s17, s6, 15
 ; GFX9-NEXT:    s_bfe_u32 s6, s6, 0x40004
-; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s17, s6
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s15, s16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s17, s6
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s13, s14
-; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX9-NEXT:    global_load_ushort v5, v0, s[0:1]
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s15, s16
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v1, v5
-; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s13, s14
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
-; GFX9-NEXT:    global_load_ushort v6, v[0:1], off
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s7, s12
-; GFX9-NEXT:    v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v3, v3, v7
+; GFX9-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v6, v4, v6
-; GFX9-NEXT:    v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NEXT:    v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v2
-; GFX9-NEXT:    v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    v_add_u32_e32 v5, v1, v5
+; GFX9-NEXT:    v_add_u32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot8_acc16_vecMul:
@@ -1821,11 +1803,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_and_b32 s11, s2, 15
 ; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40004
 ; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s9, s10
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s5, s8
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s9, s10
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s5, s8
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
 ; GFX9-DL-NEXT:    s_bfe_u32 s7, s6, 0x40018
 ; GFX9-DL-NEXT:    s_lshr_b32 s12, s6, 28
@@ -1835,115 +1817,112 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x4000c
 ; GFX9-DL-NEXT:    s_and_b32 s17, s6, 15
 ; GFX9-DL-NEXT:    s_bfe_u32 s6, s6, 0x40004
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s17, s6
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s15, s16
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s17, s6
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s13, s14
-; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX9-DL-NEXT:    global_load_ushort v5, v0, s[0:1]
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s15, s16
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v1, v5
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v0, v4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s13, s14
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
-; GFX9-DL-NEXT:    global_load_ushort v6, v[0:1], off
 ; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s7, s12
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v7
+; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v8
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u32_e32 v6, v4, v6
-; GFX9-DL-NEXT:    v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT:    v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_e32 v4, v4, v2
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT:    v_add_u32_e32 v5, v1, v5
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v4
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot8_acc16_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s14, -1
 ; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 28
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s0, 0x4000c
-; GFX10-DL-NEXT:    s_and_b32 s8, s0, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s0, 0x4000c
+; GFX10-DL-NEXT:    s_and_b32 s10, s0, 15
 ; GFX10-DL-NEXT:    s_bfe_u32 s0, s0, 0x40004
-; GFX10-DL-NEXT:    s_and_b32 s9, s1, 15
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s8, s0
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s11, s1, 15
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s10, s0
+; GFX10-DL-NEXT:    s_bfe_u32 s10, s1, 0x40004
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s11, s10
+; GFX10-DL-NEXT:    s_bfe_u32 s11, s1, 0x4000c
 ; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s9, s8
-; GFX10-DL-NEXT:    s_bfe_u32 s9, s1, 0x4000c
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1]
 ; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40008
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s11
 ; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40014
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v3
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
 ; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s9
-; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s8, s0
 ; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1]
 ; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40014
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
-; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s6, s0
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s1, 28
 ; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s2, s3
-; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s8, s0
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s10, s0
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v6 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v5
-; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s1 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
 ; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v4
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
                                               i16 addrspace(1)* nocapture %dst) {
@@ -2147,288 +2126,284 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s22, -1
-; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-NEXT:    s_addc_u32 s21, s21, 0
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 4
-; GFX9-NEXT:    s_lshr_b32 s14, s2, 4
-; GFX9-NEXT:    v_lshlrev_b16_e64 v3, 12, s1
-; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s2
-; GFX9-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
-; GFX9-NEXT:    v_lshlrev_b16_e64 v14, 12, s14
-; GFX9-NEXT:    s_lshr_b32 s8, s1, 12
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 8
-; GFX9-NEXT:    s_lshr_b32 s15, s2, 12
-; GFX9-NEXT:    s_lshr_b32 s16, s2, 8
-; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s9
-; GFX9-NEXT:    v_lshlrev_b16_e64 v6, 12, s8
-; GFX9-NEXT:    v_lshlrev_b16_e64 v12, 12, s16
-; GFX9-NEXT:    v_lshlrev_b16_e64 v13, 12, s15
+; GFX9-NEXT:    s_lshr_b32 s9, s3, 4
+; GFX9-NEXT:    s_lshr_b32 s16, s4, 4
+; GFX9-NEXT:    v_lshlrev_b16_e64 v2, 12, s3
+; GFX9-NEXT:    v_lshlrev_b16_e64 v3, 12, s4
+; GFX9-NEXT:    v_lshlrev_b16_e64 v6, 12, s9
+; GFX9-NEXT:    v_lshlrev_b16_e64 v13, 12, s16
+; GFX9-NEXT:    s_lshr_b32 s10, s3, 12
+; GFX9-NEXT:    s_lshr_b32 s11, s3, 8
+; GFX9-NEXT:    s_lshr_b32 s17, s4, 12
+; GFX9-NEXT:    s_lshr_b32 s18, s4, 8
+; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s11
+; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s10
+; GFX9-NEXT:    v_lshlrev_b16_e64 v11, 12, s18
+; GFX9-NEXT:    v_lshlrev_b16_e64 v12, 12, s17
+; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
+; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
+; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
-; GFX9-NEXT:    v_mul_lo_u16_e32 v3, v3, v4
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    s_lshr_b32 s3, s1, 20
-; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s2, 20
-; GFX9-NEXT:    s_lshr_b32 s11, s2, 16
+; GFX9-NEXT:    v_mul_lo_u16_e32 v2, v2, v3
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
-; GFX9-NEXT:    v_lshlrev_b16_e64 v10, 12, s4
-; GFX9-NEXT:    v_lshlrev_b16_e64 v11, 12, s3
-; GFX9-NEXT:    v_lshlrev_b16_e64 v17, 12, s11
-; GFX9-NEXT:    v_lshlrev_b16_e64 v18, 12, s10
-; GFX9-NEXT:    s_lshr_b32 s5, s1, 28
-; GFX9-NEXT:    s_lshr_b32 s6, s1, 24
-; GFX9-NEXT:    s_lshr_b32 s12, s2, 28
-; GFX9-NEXT:    s_lshr_b32 s13, s2, 24
-; GFX9-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX9-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e64 v8, 12, s6
-; GFX9-NEXT:    v_lshlrev_b16_e64 v9, 12, s5
-; GFX9-NEXT:    v_lshlrev_b16_e64 v15, 12, s13
-; GFX9-NEXT:    v_lshlrev_b16_e64 v16, 12, s12
-; GFX9-NEXT:    v_or_b32_e32 v5, v3, v5
+; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_lshr_b32 s5, s3, 20
+; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s12, s4, 20
+; GFX9-NEXT:    s_lshr_b32 s13, s4, 16
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_e32 v4, v4, v11
+; GFX9-NEXT:    v_lshlrev_b16_e64 v9, 12, s6
+; GFX9-NEXT:    v_lshlrev_b16_e64 v10, 12, s5
+; GFX9-NEXT:    v_lshlrev_b16_e64 v16, 12, s13
+; GFX9-NEXT:    v_lshlrev_b16_e64 v17, 12, s12
+; GFX9-NEXT:    s_lshr_b32 s7, s3, 28
+; GFX9-NEXT:    s_lshr_b32 s8, s3, 24
+; GFX9-NEXT:    s_lshr_b32 s14, s4, 28
+; GFX9-NEXT:    s_lshr_b32 s15, s4, 24
+; GFX9-NEXT:    v_and_b32_e32 v2, s2, v2
+; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e64 v7, 12, s8
+; GFX9-NEXT:    v_lshlrev_b16_e64 v8, 12, s7
+; GFX9-NEXT:    v_lshlrev_b16_e64 v14, 12, s15
+; GFX9-NEXT:    v_lshlrev_b16_e64 v15, 12, s14
+; GFX9-NEXT:    v_or_b32_e32 v4, v2, v4
+; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
+; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
-; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-NEXT:    v_ashrrev_i16_e32 v18, 12, v18
+; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
-; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v4, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_e32 v10, v10, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v5
-; GFX9-NEXT:    v_or_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_e32 v8, v8, v15
-; GFX9-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX9-NEXT:    v_or_b32_e32 v6, v4, v8
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v3, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_e32 v9, v9, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v4
+; GFX9-NEXT:    v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
+; GFX9-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX9-NEXT:    v_or_b32_e32 v5, v3, v7
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v7
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v6
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v6
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v5
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot8_acc8_vecMul:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_mov_b32 s22, -1
-; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
-; GFX9-DL-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_lshr_b32 s7, s1, 4
-; GFX9-DL-NEXT:    s_lshr_b32 s14, s2, 4
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s1
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s2
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s14
-; GFX9-DL-NEXT:    s_lshr_b32 s8, s1, 12
-; GFX9-DL-NEXT:    s_lshr_b32 s9, s1, 8
-; GFX9-DL-NEXT:    s_lshr_b32 s15, s2, 12
-; GFX9-DL-NEXT:    s_lshr_b32 s16, s2, 8
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s9
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s8
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s16
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s15
+; GFX9-DL-NEXT:    s_lshr_b32 s9, s3, 4
+; GFX9-DL-NEXT:    s_lshr_b32 s16, s4, 4
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s3
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s4
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s9
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s16
+; GFX9-DL-NEXT:    s_lshr_b32 s10, s3, 12
+; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 8
+; GFX9-DL-NEXT:    s_lshr_b32 s17, s4, 12
+; GFX9-DL-NEXT:    s_lshr_b32 s18, s4, 8
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s11
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s10
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s18
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s17
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, v3, v4
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    s_lshr_b32 s3, s1, 20
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX9-DL-NEXT:    s_lshr_b32 s10, s2, 20
-; GFX9-DL-NEXT:    s_lshr_b32 s11, s2, 16
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v2, v2, v3
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s4
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s3
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v17, 12, s11
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v18, 12, s10
-; GFX9-DL-NEXT:    s_lshr_b32 s5, s1, 28
-; GFX9-DL-NEXT:    s_lshr_b32 s6, s1, 24
-; GFX9-DL-NEXT:    s_lshr_b32 s12, s2, 28
-; GFX9-DL-NEXT:    s_lshr_b32 s13, s2, 24
-; GFX9-DL-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s6
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s5
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s13
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v16, 12, s12
-; GFX9-DL-NEXT:    v_or_b32_e32 v5, v3, v5
+; GFX9-DL-NEXT:    v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    s_lshr_b32 s5, s3, 20
+; GFX9-DL-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX9-DL-NEXT:    s_lshr_b32 s12, s4, 20
+; GFX9-DL-NEXT:    s_lshr_b32 s13, s4, 16
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v4, v4, v11
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s6
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s5
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v16, 12, s13
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v17, 12, s12
+; GFX9-DL-NEXT:    s_lshr_b32 s7, s3, 28
+; GFX9-DL-NEXT:    s_lshr_b32 s8, s3, 24
+; GFX9-DL-NEXT:    s_lshr_b32 s14, s4, 28
+; GFX9-DL-NEXT:    s_lshr_b32 s15, s4, 24
+; GFX9-DL-NEXT:    v_and_b32_e32 v2, s2, v2
+; GFX9-DL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s8
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s7
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s15
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s14
+; GFX9-DL-NEXT:    v_or_b32_e32 v4, v2, v4
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v18, 12, v18
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v10, v10, v17
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v5
-; GFX9-DL-NEXT:    v_or_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v8, v8, v15
-; GFX9-DL-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX9-DL-NEXT:    v_or_b32_e32 v6, v4, v8
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v3, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, v9, v16
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v4
+; GFX9-DL-NEXT:    v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
+; GFX9-DL-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX9-DL-NEXT:    v_or_b32_e32 v5, v3, v7
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v7
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v3, 8, v6
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v6
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v5
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot8_acc8_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s22, -1
 ; GFX10-DL-NEXT:    s_mov_b32 s23, 0x31c16000
 ; GFX10-DL-NEXT:    s_add_u32 s20, s20, s3
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s7, s0, 4
-; GFX10-DL-NEXT:    s_lshr_b32 s14, s1, 4
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s14
-; GFX10-DL-NEXT:    s_lshr_b32 s8, s0, 12
-; GFX10-DL-NEXT:    s_lshr_b32 s15, s1, 12
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s0
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s15
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s8
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 12, v7
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v13, 12, v13
-; GFX10-DL-NEXT:    s_lshr_b32 s9, s0, 8
-; GFX10-DL-NEXT:    s_lshr_b32 s16, s1, 8
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s9
+; GFX10-DL-NEXT:    s_lshr_b32 s9, s0, 4
+; GFX10-DL-NEXT:    s_lshr_b32 s16, s1, 4
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s9
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s16
+; GFX10-DL-NEXT:    s_lshr_b32 s10, s0, 12
+; GFX10-DL-NEXT:    s_lshr_b32 s17, s1, 12
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s0
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s1
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s17
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s10
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 12, v6
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v12
+; GFX10-DL-NEXT:    s_lshr_b32 s11, s0, 8
+; GFX10-DL-NEXT:    s_lshr_b32 s18, s1, 8
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s11
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s18
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v2, 12, v2
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v19, 12, v5
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, v6, v12
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v13, 12, v13
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v19, 12, v6
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, v7, v13
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v14, 12, v14
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v5
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v12
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, v3, v4
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 8, v7
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v19, v14
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v11
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v2, v2, v3
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v6
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, v19, v13
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 20
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s5, s0, 28
-; GFX10-DL-NEXT:    s_lshr_b32 s6, s0, 24
-; GFX10-DL-NEXT:    s_lshr_b32 s10, s1, 20
-; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s6
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s5
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s4
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s3
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s10
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, v5, v12
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 8, v4
-; GFX10-DL-NEXT:    s_lshr_b32 s11, s1, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s12, s1, 28
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s11
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 12, v8
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s7, s0, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s12, s1, 20
+; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s8
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s7
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s6
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s3
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s12
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v4, v11
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v3
+; GFX10-DL-NEXT:    s_lshr_b32 s13, s1, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s14, s1, 28
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s13
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v7
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 12, v8
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v8, 12, v9
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v9, 12, v10
-; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v16, 12, s12
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v11
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v10, 12, v13
-; GFX10-DL-NEXT:    s_lshr_b32 s13, s1, 24
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 12, v7
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s13
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v16
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, v5, v10
-; GFX10-DL-NEXT:    v_or_b32_e32 v4, v3, v4
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v10, v9, v7
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v15
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v8, v8, v11
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
+; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s14
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v10
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v9, 12, v12
+; GFX10-DL-NEXT:    s_lshr_b32 s15, s1, 24
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 12, v6
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v10, 12, v15
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s15
+; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v3
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v4, v9
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v15, v8, v6
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, v7, v10
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v14
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v5
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, v6, v12
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 8, v8
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v9
-; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_or_b32_e32 v4, v3, v5
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 8, v4
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 8, v4
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v5, v11
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v7
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v8
+; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v4
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
                                              i8 addrspace(1)* nocapture %dst) {

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 8e4ddbf16c71..b37d1d12c6f7 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -144,26 +144,25 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40004
 ; GFX9-NEXT:    s_and_b32 s2, s2, 15
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s17
-; GFX9-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s16
-; GFX9-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s15
-; GFX9-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s14
-; GFX9-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s13
-; GFX9-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s12
-; GFX9-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s16
+; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s15
+; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s13
+; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s12
+; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot8_acc32:
@@ -179,14 +178,13 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, s4, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc32:
@@ -199,6 +197,7 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -207,10 +206,8 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s0, s1, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v0, s0, s1, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
                                        i32 addrspace(1)* nocapture %dst) {
@@ -394,156 +391,152 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s18, -1
-; GFX9-NEXT:    s_mov_b32 s19, 0xe00000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s22, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 s16, s16, s3
-; GFX9-NEXT:    s_addc_u32 s17, s17, 0
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
+; GFX9-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 28
-; GFX9-NEXT:    s_and_b32 s1, s1, 15
-; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX9-NEXT:    s_and_b32 s0, s0, 15
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-NEXT:    v_mov_b32_e32 v5, s14
-; GFX9-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-NEXT:    v_mov_b32_e32 v7, s12
-; GFX9-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-NEXT:    v_mov_b32_e32 v9, s10
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s17, s3, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s11, s3, 28
+; GFX9-NEXT:    s_and_b32 s3, s3, 15
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40004
+; GFX9-NEXT:    s_and_b32 s2, s2, 15
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mov_b32_e32 v8, s12
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot8_acc16:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT:    s_mov_b32 s18, -1
-; GFX9-DL-NEXT:    s_mov_b32 s19, 0xe00000
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT:    s_mov_b32 s22, -1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_add_u32 s16, s16, s3
-; GFX9-DL-NEXT:    s_addc_u32 s17, s17, 0
+; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX9-DL-NEXT:    s_lshr_b32 s9, s1, 28
-; GFX9-DL-NEXT:    s_and_b32 s1, s1, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX9-DL-NEXT:    s_and_b32 s0, s0, 15
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s14
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s12
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s10
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s3, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s3, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s3, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s3, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s3, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s17, s3, 0x40004
+; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 28
+; GFX9-DL-NEXT:    s_and_b32 s3, s3, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40004
+; GFX9-DL-NEXT:    s_and_b32 s2, s2, 15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc16:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT:    s_mov_b32 s6, -1
-; GFX10-DL-NEXT:    s_mov_b32 s7, 0x31c16000
-; GFX10-DL-NEXT:    s_add_u32 s4, s4, s3
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_addc_u32 s5, s5, 0
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT:    s_mov_b32 s10, -1
+; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
 ; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40004
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40008
-; GFX10-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x4000c
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x4000c
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40014
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
                                        i16 addrspace(1)* nocapture %dst) {
@@ -727,156 +720,152 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s18, -1
-; GFX9-NEXT:    s_mov_b32 s19, 0xe00000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s22, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 s16, s16, s3
-; GFX9-NEXT:    s_addc_u32 s17, s17, 0
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
+; GFX9-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 28
-; GFX9-NEXT:    s_and_b32 s1, s1, 15
-; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX9-NEXT:    s_and_b32 s0, s0, 15
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-NEXT:    v_mov_b32_e32 v5, s14
-; GFX9-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-NEXT:    v_mov_b32_e32 v7, s12
-; GFX9-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-NEXT:    v_mov_b32_e32 v9, s10
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s17, s3, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s11, s3, 28
+; GFX9-NEXT:    s_and_b32 s3, s3, 15
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40004
+; GFX9-NEXT:    s_and_b32 s2, s2, 15
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mov_b32_e32 v8, s12
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot8_acc8:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT:    s_mov_b32 s18, -1
-; GFX9-DL-NEXT:    s_mov_b32 s19, 0xe00000
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT:    s_mov_b32 s22, -1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_add_u32 s16, s16, s3
-; GFX9-DL-NEXT:    s_addc_u32 s17, s17, 0
+; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40004
-; GFX9-DL-NEXT:    s_lshr_b32 s9, s1, 28
-; GFX9-DL-NEXT:    s_and_b32 s1, s1, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40004
-; GFX9-DL-NEXT:    s_and_b32 s0, s0, 15
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s14
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s12
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s10
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s3, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s3, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s3, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s3, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s3, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s17, s3, 0x40004
+; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 28
+; GFX9-DL-NEXT:    s_and_b32 s3, s3, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40004
+; GFX9-DL-NEXT:    s_and_b32 s2, s2, 15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc8:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT:    s_mov_b32 s6, -1
-; GFX10-DL-NEXT:    s_mov_b32 s7, 0x31c16000
-; GFX10-DL-NEXT:    s_add_u32 s4, s4, s3
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_addc_u32 s5, s5, 0
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT:    s_mov_b32 s10, -1
+; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
 ; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40004
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40008
-; GFX10-DL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x4000c
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x4000c
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40014
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
                                       i8 addrspace(1)* nocapture %dst) {
@@ -1064,165 +1053,161 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s18, -1
-; GFX9-NEXT:    s_mov_b32 s19, 0xe00000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s22, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 s16, s16, s3
-; GFX9-NEXT:    s_addc_u32 s17, s17, 0
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
+; GFX9-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s8, s0, 15
-; GFX9-NEXT:    s_and_b32 s15, s1, 15
-; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40008
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 28
-; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v5, s14
-; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, s12
-; GFX9-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-NEXT:    v_mov_b32_e32 v9, s10
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NEXT:    s_and_b32 s10, s2, 15
+; GFX9-NEXT:    s_and_b32 s17, s3, 15
+; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x40008
+; GFX9-NEXT:    s_lshr_b32 s11, s3, 28
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x4000c
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    v_mul_u32_u24_e32 v2, s2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot8_acc4:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT:    s_mov_b32 s18, -1
-; GFX9-DL-NEXT:    s_mov_b32 s19, 0xe00000
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT:    s_mov_b32 s22, -1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_add_u32 s16, s16, s3
-; GFX9-DL-NEXT:    s_addc_u32 s17, s17, 0
+; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s8, s0, 15
-; GFX9-DL-NEXT:    s_and_b32 s15, s1, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40008
-; GFX9-DL-NEXT:    s_lshr_b32 s9, s1, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s14
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
-; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s12
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s10
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-DL-NEXT:    s_and_b32 s10, s2, 15
+; GFX9-DL-NEXT:    s_and_b32 s17, s3, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s3, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s3, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s3, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s3, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s3, 0x40008
+; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s3, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x4000c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, s2, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc4:
 ; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s10, -1
 ; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
 ; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
 ; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40004
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x4000c
-; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s3, s5
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v2, s3, s7
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s6, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40014
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
                                       i4 addrspace(1)* nocapture %dst) {
@@ -1394,165 +1379,161 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s18, -1
-; GFX9-NEXT:    s_mov_b32 s19, 0xe00000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s22, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 s16, s16, s3
-; GFX9-NEXT:    s_addc_u32 s17, s17, 0
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
+; GFX9-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s8, s0, 15
-; GFX9-NEXT:    s_and_b32 s15, s1, 15
-; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40008
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 28
-; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v5, s14
-; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, s12
-; GFX9-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-NEXT:    v_mov_b32_e32 v9, s10
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NEXT:    s_and_b32 s10, s2, 15
+; GFX9-NEXT:    s_and_b32 s17, s3, 15
+; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x40008
+; GFX9-NEXT:    s_lshr_b32 s11, s3, 28
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x4000c
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    v_mul_u32_u24_e32 v2, s2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot8_CommutationInsideMAD:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT:    s_mov_b32 s18, -1
-; GFX9-DL-NEXT:    s_mov_b32 s19, 0xe00000
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT:    s_mov_b32 s22, -1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_add_u32 s16, s16, s3
-; GFX9-DL-NEXT:    s_addc_u32 s17, s17, 0
+; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s8, s0, 15
-; GFX9-DL-NEXT:    s_and_b32 s15, s1, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40008
-; GFX9-DL-NEXT:    s_lshr_b32 s9, s1, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s14
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
-; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s12
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s10
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-DL-NEXT:    s_and_b32 s10, s2, 15
+; GFX9-DL-NEXT:    s_and_b32 s17, s3, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s3, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s3, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s3, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s3, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s3, 0x40008
+; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s3, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x4000c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, s2, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
 ; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s10, -1
 ; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
 ; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
 ; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40008
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40004
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x4000c
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x4000c
-; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s2, s3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v2, s2, s3
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s6, s7, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40014
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                       <8 x i4> addrspace(1)* %src2,
                                                       i4 addrspace(1)* nocapture %dst) {
@@ -1748,28 +1729,27 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40004
 ; GFX9-NEXT:    s_and_b32 s2, s2, 15
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s17
-; GFX9-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
-; GFX9-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s16
-; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s15
-; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s14
-; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s13
-; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s12
-; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
-; GFX9-NEXT:    v_add_u32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s11, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s16
+; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s14
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s13
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s12
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot8_multiuses_mul1:
@@ -1803,28 +1783,27 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40008
 ; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40004
 ; GFX9-DL-NEXT:    s_and_b32 s2, s2, 15
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s17
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s16
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s15
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s14
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s13
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s12
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v1, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s11, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s16
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s14
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s13
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s12
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_multiuses_mul1:
@@ -1838,6 +1817,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1870,10 +1850,8 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s6, s7, v1
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v0, v1
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                                 <8 x i4> addrspace(1)* %src2,
                                                 i32 addrspace(1)* nocapture %dst) {
@@ -2083,26 +2061,25 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40004
 ; GFX9-NEXT:    s_and_b32 s2, s2, 15
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s17
-; GFX9-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s16
-; GFX9-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s15
-; GFX9-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s14
-; GFX9-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s13
-; GFX9-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s12
-; GFX9-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s16
+; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s15
+; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s13
+; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s12
+; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot8_acc32_vecMul:
@@ -2118,14 +2095,13 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, s4, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc32_vecMul:
@@ -2138,6 +2114,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2146,10 +2123,8 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s0, s1, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v0, s0, s1, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
@@ -2308,11 +2283,13 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s22, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT:    global_load_ushort v5, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s22, -1
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s20, s20, s3
@@ -2326,54 +2303,53 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40010
 ; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    v_pk_mul_lo_u16 v2, s3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_pk_mul_lo_u16 v1, s3, v1
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s13, s14
-; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
 ; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
 ; GFX9-NEXT:    s_and_b32 s17, s6, 15
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s8
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s15, s16
+; GFX9-NEXT:    s_bfe_u32 s6, s6, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s17, s6
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
 ; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s6, s6, 0x40004
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s15, s16
-; GFX9-NEXT:    v_pk_mul_lo_u16 v3, s4, v0
 ; GFX9-NEXT:    s_and_b32 s11, s2, 15
 ; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s9, s10
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s17, s6
-; GFX9-NEXT:    v_pk_mul_lo_u16 v4, s4, v0
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-NEXT:    v_pk_mul_lo_u16 v5, s2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ushort v6, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s5, s8
+; GFX9-NEXT:    v_pk_mul_lo_u16 v4, s2, v4
+; GFX9-NEXT:    v_pk_mul_lo_u16 v2, s4, v2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s9, s10
+; GFX9-NEXT:    v_pk_mul_lo_u16 v3, s4, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v6, v5, v6
-; GFX9-NEXT:    v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u32_e32 v5, v4, v5
 ; GFX9-NEXT:    v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT:    v_add_u32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
 ; GFX9-NEXT:    v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v1
+; GFX9-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot8_acc16_vecMul:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT:    s_mov_b32 s22, -1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-DL-NEXT:    global_load_ushort v5, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s22, -1
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
@@ -2387,102 +2363,97 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_bfe_u32 s13, s6, 0x40010
 ; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40014
 ; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, s3, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, s3, v1
 ; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s13, s14
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
 ; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x40008
 ; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x4000c
 ; GFX9-DL-NEXT:    s_and_b32 s17, s6, 15
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s5, s8
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s15, s16
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s6, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s17, s6
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
 ; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
 ; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s6, 0x40004
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s15, s16
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, s4, v0
 ; GFX9-DL-NEXT:    s_and_b32 s11, s2, 15
 ; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s9, s10
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s3, s17, s6
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, s4, v0
 ; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s11, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, s2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ushort v6, v[0:1], off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s5, s8
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, s2, v4
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, s4, v2
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s9, s10
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, s4, v3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u32_e32 v6, v5, v6
-; GFX9-DL-NEXT:    v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_add_u32_e32 v5, v4, v5
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_e32 v4, v4, v3
+; GFX9-DL-NEXT:    v_add_u32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_add_u32_e32 v3, v3, v2
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v1
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc16_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s10, -1
 ; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
 ; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s0, 0x40004
 ; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40004
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40008
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s2, s3
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40004
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s7
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40008
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, s2, s3
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x4000c
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
 ; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x40014
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, s2, s4
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s2, s6
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40014
 ; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
 ; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s3, s1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s2, s4
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, s2, s6
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s2, s0
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, s0, s1
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v4
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s0, s1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
                                               i16 addrspace(1)* nocapture %dst) {
@@ -2678,214 +2649,210 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s22, -1
-; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-NEXT:    s_addc_u32 s21, s21, 0
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s3, s1, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s12, s2, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s13, s2, 28
-; GFX9-NEXT:    s_and_b32 s14, s2, 15
-; GFX9-NEXT:    s_bfe_u32 s15, s2, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s16, s2, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v3, s10
-; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s4, s1, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-NEXT:    s_bfe_u32 s5, s1, 0x40018
-; GFX9-NEXT:    v_mov_b32_e32 v5, s12
-; GFX9-NEXT:    s_lshr_b32 s6, s1, 28
-; GFX9-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-NEXT:    s_and_b32 s7, s1, 15
-; GFX9-NEXT:    v_mov_b32_e32 v7, s14
-; GFX9-NEXT:    s_bfe_u32 s8, s1, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v8, s15
-; GFX9-NEXT:    s_bfe_u32 s9, s1, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v9, s16
-; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v10, s2
-; GFX9-NEXT:    v_mul_lo_u16_e32 v3, s3, v3
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_e32 v5, s5, v5
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_e32 v7, s7, v7
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX9-NEXT:    v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX9-NEXT:    v_mul_lo_u16_e32 v9, s9, v9
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_e32 v5, s0, v5
-; GFX9-NEXT:    v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v6, v5, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v6
-; GFX9-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX9-NEXT:    v_or_b32_e32 v4, v3, v4
+; GFX9-NEXT:    s_bfe_u32 s5, s3, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s12, s4, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s13, s4, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s14, s4, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s15, s4, 28
+; GFX9-NEXT:    s_and_b32 s16, s4, 15
+; GFX9-NEXT:    s_bfe_u32 s17, s4, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s18, s4, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v2, s12
+; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s6, s3, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v3, s13
+; GFX9-NEXT:    s_bfe_u32 s7, s3, 0x40018
+; GFX9-NEXT:    v_mov_b32_e32 v4, s14
+; GFX9-NEXT:    s_lshr_b32 s8, s3, 28
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    s_and_b32 s9, s3, 15
+; GFX9-NEXT:    v_mov_b32_e32 v6, s16
+; GFX9-NEXT:    s_bfe_u32 s10, s3, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v7, s17
+; GFX9-NEXT:    s_bfe_u32 s11, s3, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v8, s18
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x4000c
+; GFX9-NEXT:    v_mov_b32_e32 v9, s4
+; GFX9-NEXT:    v_mul_lo_u16_e32 v2, s5, v2
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_e32 v4, s7, v4
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_e32 v6, s9, v6
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX9-NEXT:    v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v4, v6, v7
+; GFX9-NEXT:    v_mul_lo_u16_e32 v8, s11, v8
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v4, s2, v4
+; GFX9-NEXT:    v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v5, v4, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v5
+; GFX9-NEXT:    v_and_b32_e32 v2, s2, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, v2, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v7
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v4
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NEXT:    v_add_u32_e32 v1, v4, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v6
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot8_acc8_vecMul:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_mov_b32 s22, -1
-; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
 ; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
-; GFX9-DL-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s2, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s13, s2, 28
-; GFX9-DL-NEXT:    s_and_b32 s14, s2, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s15, s2, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s16, s2, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
-; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s1, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s1, 0x40018
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s12
-; GFX9-DL-NEXT:    s_lshr_b32 s6, s1, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT:    s_and_b32 s7, s1, 15
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s14
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s1, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s15
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s1, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s16
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s2
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, s3, v3
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, s5, v5
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, s7, v7
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX9-DL-NEXT:    v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, s9, v9
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, s0, v5
-; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_or_b32_e32 v6, v5, v6
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v6
-; GFX9-DL-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX9-DL-NEXT:    v_or_b32_e32 v4, v3, v4
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s3, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s4, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s4, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s4, 0x40018
+; GFX9-DL-NEXT:    s_lshr_b32 s15, s4, 28
+; GFX9-DL-NEXT:    s_and_b32 s16, s4, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s17, s4, 0x40004
+; GFX9-DL-NEXT:    s_bfe_u32 s18, s4, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s12
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s4, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s3, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s13
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s3, 0x40018
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s14
+; GFX9-DL-NEXT:    s_lshr_b32 s8, s3, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    s_and_b32 s9, s3, 15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s16
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s3, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s17
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s3, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s18
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s3, 0x4000c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s4
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v2, s5, v2
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v4, s7, v4
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v6, s9, v6
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX9-DL-NEXT:    v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_or_b32_e32 v4, v6, v7
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v8, s11, v8
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_and_b32_e32 v4, s2, v4
+; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_or_b32_e32 v5, v4, v5
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v5
+; GFX9-DL-NEXT:    v_and_b32_e32 v2, s2, v2
+; GFX9-DL-NEXT:    v_or_b32_e32 v3, v2, v3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v5, v2
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v7
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v3, 8, v4
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v4, v1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v6
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc8_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT:    s_mov_b32 s10, -1
-; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
-; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT:    s_mov_b32 s14, -1
+; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
+; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40004
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, s3, s5
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v2, s3, s7
 ; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x4000c
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s2, s3
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s6, s5
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v3
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s0, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, s2, s3
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s8, s7
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 8, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s1, 0x40008
 ; GFX10-DL-NEXT:    s_mov_b32 s3, 0xffff
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s4, s2
-; GFX10-DL-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v5
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40014
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s6, s2
+; GFX10-DL-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 8, v4
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40014
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s3, v3
-; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s4, s6
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40010
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, s3, v2
+; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s6, s8
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s0, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s1, 0x40010
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
-; GFX10-DL-NEXT:    v_or_b32_e32 v4, v3, v4
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s1, 28
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s2, s7
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, s0, s4
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v5
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
+; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v3
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s1, 28
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s2, s9
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s0, s6
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 8, v4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
 ; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40018
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v10, s5, s0
-; GFX10-DL-NEXT:    v_or_b32_e32 v5, v6, v5
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 8, v7
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, s3, v5
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v11, s7, s0
+; GFX10-DL-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 8, v6
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, s3, v4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v8
-; GFX10-DL-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v5
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v4
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
+; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v7
+; GFX10-DL-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v4
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
                                              i8 addrspace(1)* nocapture %dst) {
@@ -3037,165 +3004,161 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s18, -1
-; GFX9-NEXT:    s_mov_b32 s19, 0xe00000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s22, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 s16, s16, s3
-; GFX9-NEXT:    s_addc_u32 s17, s17, 0
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
+; GFX9-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s8, s0, 15
-; GFX9-NEXT:    s_and_b32 s15, s1, 15
-; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40008
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 28
-; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v5, s14
-; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, s12
-; GFX9-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-NEXT:    v_mov_b32_e32 v9, s10
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NEXT:    s_and_b32 s10, s2, 15
+; GFX9-NEXT:    s_and_b32 s17, s3, 15
+; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x40008
+; GFX9-NEXT:    s_lshr_b32 s11, s3, 28
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x4000c
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    v_mul_u32_u24_e32 v2, s2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot8_acc4_vecMul:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT:    s_mov_b32 s18, -1
-; GFX9-DL-NEXT:    s_mov_b32 s19, 0xe00000
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT:    s_mov_b32 s22, -1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_add_u32 s16, s16, s3
-; GFX9-DL-NEXT:    s_addc_u32 s17, s17, 0
+; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s8, s0, 15
-; GFX9-DL-NEXT:    s_and_b32 s15, s1, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40008
-; GFX9-DL-NEXT:    s_lshr_b32 s9, s1, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s14
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
-; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s12
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s10
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-DL-NEXT:    s_and_b32 s10, s2, 15
+; GFX9-DL-NEXT:    s_and_b32 s17, s3, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s3, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s3, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s3, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s3, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s3, 0x40008
+; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s3, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x4000c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, s2, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc4_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s10, -1
 ; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
 ; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
-; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
 ; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40004
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x4000c
-; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s3, s5
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v2, s3, s7
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s6, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40014
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
                                              i4 addrspace(1)* nocapture %dst) {
@@ -3326,6 +3289,7 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
@@ -3340,9 +3304,9 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX9-NEXT:    s_bfe_u32 s14, s2, 0x40014
 ; GFX9-NEXT:    s_bfe_u32 s16, s2, 0x40018
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s3, 0x40004
 ; GFX9-NEXT:    s_bfe_u32 s9, s3, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s11, s3, 0x4000c
@@ -3350,40 +3314,37 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x40014
 ; GFX9-NEXT:    s_bfe_u32 s17, s3, 0x40018
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, 28
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s12
-; GFX9-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s14
-; GFX9-NEXT:    v_mad_u32_u24 v0, s15, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s16
-; GFX9-NEXT:    v_mad_u32_u24 v2, s17, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s12
+; GFX9-NEXT:    v_mad_u32_u24 v1, s13, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mad_u32_u24 v1, s15, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s16
+; GFX9-NEXT:    v_mad_u32_u24 v1, s17, v2, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot8_variant1:
 ; GFX9-DL:       ; %bb.0: ; %entry
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s4, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, s4, v1, v2
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_variant1:
@@ -3391,6 +3352,7 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -3398,10 +3360,8 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s1, s0, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v0, s1, s0, v0
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
                                           i32 addrspace(1)* %v2addr,
                                           i32 addrspace(1)* %dst) {

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 9f4b3b8890fa..e26d577c9bee 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -7,14 +7,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out,
 ; GFX9-LABEL: s_insertelement_v2i16_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_lh_b32_b16 s0, 0x3e7, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_pack_lh_b32_b16 s2, 0x3e7, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; CIVI-LABEL: s_insertelement_v2i16_0:
@@ -42,14 +41,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_lh_b32_b16 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s4, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_insertelement_v2i16_0_reg:
@@ -94,17 +92,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ; use s2
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -161,14 +158,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_hh_b32_b16 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s4, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_insertelement_v2i16_0_reghi:
@@ -215,17 +211,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_lshr_b32 s3, s4, 16
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_lh_b32_b16 s1, s0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s3, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ; use s3
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -281,21 +276,20 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_lshr_b32 s3, s4, 16
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s1, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s3, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s1
+; GFX9-NEXT:    ; use s3
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ; use s2
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -361,14 +355,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
 ; GFX9-LABEL: s_insertelement_v2i16_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, 0x3e7
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x3e7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; CIVI-LABEL: s_insertelement_v2i16_1:
@@ -395,14 +388,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_insertelement_v2i16_1_reg:
@@ -446,15 +438,14 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out
 ; GFX9-LABEL: s_insertelement_v2f16_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, 0x4500, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, 0x4500, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; CIVI-LABEL: s_insertelement_v2f16_0:
@@ -480,14 +471,13 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out
 ; GFX9-LABEL: s_insertelement_v2f16_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, 0x4500
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x4500
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; CIVI-LABEL: s_insertelement_v2f16_1:
@@ -1050,19 +1040,18 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
-; GFX9-NEXT:    s_andn2_b32 s1, s2, s0
-; GFX9-NEXT:    s_and_b32 s0, s0, 0x3e703e7
-; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_lshl_b32 s3, s4, 4
+; GFX9-NEXT:    s_lshl_b32 s3, 0xffff, s3
+; GFX9-NEXT:    s_andn2_b32 s2, s2, s3
+; GFX9-NEXT:    s_and_b32 s3, s3, 0x3e703e7
+; GFX9-NEXT:    s_or_b32 s2, s3, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_insertelement_v2i16_dynamic:

diff  --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 3b02e85a7b6e..a3dda8dec81a 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -855,10 +855,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
 ; HSA-GFX9: kernarg_segment_byte_size = 28
-; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
-; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
-; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
+; HSA-GFX9-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
+; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17
+; HSA-GFX9: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
   %val0 = extractvalue <{i32, i64}> %arg0, 0
   %val1 = extractvalue <{i32, i64}> %arg0, 1
@@ -904,9 +905,10 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
 
 ; FIXME: Why not all scalar loads?
 ; GCN-LABEL: {{^}}array_3xi16:
-; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:2
-; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4
-; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:6
+; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:2
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:4
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:6
 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
   store volatile i8 %arg0, i8 addrspace(1)* undef
   store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
@@ -914,7 +916,8 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
 }
 
 ; GCN-LABEL: {{^}}small_array_round_down_offset:
-; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:1
+; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:1
 define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
   %val = extractvalue [1 x i8] %arg, 0
   store volatile i8 %val, i8 addrspace(1)* undef

diff  --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
index 8e90431cf742..ef3ad70343e2 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -75,8 +75,9 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
 ; HSA-VI: kernarg_segment_byte_size = 28
-; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
-; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
+; HSA-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17
+; HSA-VI: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13
 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
@@ -156,9 +157,8 @@ entry:
 ; Byref pointers should only be treated as offsets from kernarg
 ; GCN-LABEL: {{^}}byref_constant_i8_arg:
 ; GCN: kernarg_segment_byte_size = 12
-; GCN: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s4
-; GCN: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], s5
-; GCN: global_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]{{\]}}, off offset:8
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8
 define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) {
   %in = load i8, i8 addrspace(4)* %in.byref
   %ext = zext i8 %in to i32
@@ -168,9 +168,8 @@ define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %ou
 
 ; GCN-LABEL: {{^}}byref_constant_i16_arg:
 ; GCN: kernarg_segment_byte_size = 12
-; GCN: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s4
-; GCN: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], s5
-; GCN: global_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]{{\]}}, off offset:8
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8
 define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) {
   %in = load i16, i16 addrspace(4)* %in.byref
   %ext = zext i16 %in to i32
@@ -207,8 +206,8 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* noca
 ; GCN-DAG: s_load_dword [[AFTER_OFFSET:s[0-9]+]], s[4:5], 0x104{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[V_IN:v[0-9]+]], [[IN]]
 ; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], [[AFTER_OFFSET]]
-; GCN: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_IN]]
-; GCN: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_AFTER_OFFSET]]
+; GCN: global_store_dword v{{[0-9]+}}, [[V_IN]], s
+; GCN: global_store_dword v{{[0-9]+}}, [[V_AFTER_OFFSET]], s
 define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
   %in = load i32, i32 addrspace(4)* %in.byref
   store volatile i32 %in, i32 addrspace(1)* %out, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
index 4ab8c3446582..5e9b71166732 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
@@ -21,7 +21,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}global_atomic_csub:
-; GCN: global_atomic_csub v{{[0-9]+}}, v[{{[0-9:]+}}], v{{[0-9]+}}, off glc
+; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
 define amdgpu_kernel void @global_atomic_csub(i32 addrspace(1)* %ptr, i32 %data) {
 main_body:
   %ret = call i32 @llvm.amdgcn.global.atomic.csub(i32 addrspace(1)* %ptr, i32 %data)
@@ -29,7 +29,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}global_atomic_csub_off4:
-; GCN: global_atomic_csub v{{[0-9]+}}, v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4 glc
+; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc
 define amdgpu_kernel void @global_atomic_csub_off4(i32 addrspace(1)* %ptr, i32 %data) {
 main_body:
   %p = getelementptr i32, i32 addrspace(1)* %ptr, i64 1

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
index 4d49d87c67ad..130ce9e8d83f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -63,9 +63,10 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %pt
 }
 
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-; GFX9: global_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]], off glc{{$}}
+; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_atomic_dec v{{[0-9]+}}, [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out
@@ -73,9 +74,11 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32
 }
 
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
-; GFX9: global_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]], off offset:16 glc{{$}}
+
+; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_atomic_dec v{{[0-9]+}}, [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}}
 define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
@@ -84,18 +87,22 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %o
 }
 
 ; GCN-LABEL: {{^}}global_atomic_dec_noret_i32:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GFX9: global_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]], off{{$}}
+
+; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_atomic_dec [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; GCN-LABEL: {{^}}global_atomic_dec_noret_i32_offset:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
-; GFX9: global_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]], off offset:16{{$}}
+
+; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_atomic_dec [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
 define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
@@ -338,9 +345,11 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %pt
 
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off glc{{$}}
+
+; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out
@@ -349,9 +358,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64
 
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
-; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
@@ -361,9 +371,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %o
 
 ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off{{$}}
+; GFX9: global_atomic_dec_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
   ret void
@@ -371,9 +382,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) n
 
 ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
-; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off offset:32{{$}}
+; GFX9: global_atomic_dec_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
index aee44794ac89..19322c1b6481 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
@@ -38,7 +38,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}global_atomic_add_f32:
-; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off
+; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @global_atomic_add_f32(float addrspace(1)* %ptr, float %data) {
 main_body:
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
@@ -46,7 +46,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}global_atomic_add_f32_off4:
-; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4
+; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
 define amdgpu_kernel void @global_atomic_add_f32_off4(float addrspace(1)* %ptr, float %data) {
 main_body:
   %p = getelementptr float, float addrspace(1)* %ptr, i64 1
@@ -55,7 +55,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}global_atomic_add_f32_offneg4:
-; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:-4
+; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4
 define amdgpu_kernel void @global_atomic_add_f32_offneg4(float addrspace(1)* %ptr, float %data) {
 main_body:
   %p = getelementptr float, float addrspace(1)* %ptr, i64 -1
@@ -64,7 +64,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16:
-; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off
+; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @global_atomic_pk_add_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
 main_body:
   %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
@@ -72,7 +72,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_off4:
-; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4
+; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
 define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
 main_body:
   %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 1
@@ -81,7 +81,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4:
-; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:-4
+; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4{{$}}
 define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
 main_body:
   %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -1
@@ -92,7 +92,7 @@ main_body:
 ; Make sure this artificially selects with an incorrect subtarget, but
 ; the feature set.
 ; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget:
-; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off
+; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(float addrspace(1)* %ptr, float %data) #0 {
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
index ff9e1ccefbc9..6b66070e537a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -70,7 +70,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %pt
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-; GFX9: global_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]], off glc{{$}}
+; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out
@@ -80,7 +80,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
-; GFX9: global_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]], off offset:16 glc{{$}}
+; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}}
 define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
@@ -91,7 +91,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %o
 ; GCN-LABEL: {{^}}global_atomic_inc_noret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GFX9: global_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]], off{{$}}
+; GFX9: global_atomic_inc v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
   ret void
@@ -100,7 +100,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) n
 ; GCN-LABEL: {{^}}global_atomic_inc_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
-; GFX9: global_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]], off offset:16{{$}}
+; GFX9: global_atomic_inc v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
 define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
@@ -193,9 +193,10 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %pt
 
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off glc{{$}}
+; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out
@@ -204,9 +205,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64
 
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
-; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
@@ -216,10 +218,11 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o
 
 ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 
-; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off{{$}}
+; GFX9: global_atomic_inc_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
   ret void
@@ -227,9 +230,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) n
 
 ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
-; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off offset:32{{$}}
+; GFX9: global_atomic_inc_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 5df47347267b..e64555c5a6d1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -32,12 +32,11 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s0, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
   store <2 x half> %result, <2 x half> addrspace(1)* %out
@@ -71,11 +70,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s0, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
   store <2 x half> %result, <2 x half> addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
index ca5a8180989d..562e0f5dcd87 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -76,17 +76,20 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1
 ;
 ; GFX6789-LABEL: load_1d_tfe:
 ; GFX6789:       ; %bb.0: ; %main_body
+; GFX6789-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v7, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v8, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX6789-NEXT:    image_load v[0:4], v5, s[0:7] dmask:0xf unorm tfe
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v6, v4, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -94,27 +97,29 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
 ; NOPRT-NEXT:    image_load v[0:4], v0, s[0:7] dmask:0xf unorm tfe
-; NOPRT-NEXT:    v_mov_b32_e32 v5, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v6, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[5:6], v4, off
+; NOPRT-NEXT:    global_store_dword v5, v4, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_1d_tfe:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v6, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -160,17 +165,20 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1
 ;
 ; GFX6789-LABEL: load_1d_lwe:
 ; GFX6789:       ; %bb.0: ; %main_body
+; GFX6789-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v7, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v8, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX6789-NEXT:    image_load v[0:4], v5, s[0:7] dmask:0xf unorm lwe
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v6, v4, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -178,27 +186,29 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
 ; NOPRT-NEXT:    image_load v[0:4], v0, s[0:7] dmask:0xf unorm lwe
-; NOPRT-NEXT:    v_mov_b32_e32 v5, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v6, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[5:6], v4, off
+; NOPRT-NEXT:    global_store_dword v5, v4, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_1d_lwe:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v6, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -282,18 +292,21 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1
 ;
 ; GFX6789-LABEL: load_2d_tfe:
 ; GFX6789:       ; %bb.0: ; %main_body
-; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6789-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v6, v1
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v8, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v11, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v10
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v11
 ; GFX6789-NEXT:    image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v7, v4, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -301,28 +314,30 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
 ; NOPRT-NEXT:    image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe
-; NOPRT-NEXT:    v_mov_b32_e32 v5, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v6, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[5:6], v4, off
+; NOPRT-NEXT:    global_store_dword v5, v4, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_2d_tfe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0 ; encoding: [0x80,0x02,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v7 ; encoding: [0x07,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, v7 ; encoding: [0x07,0x03,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v7 ; encoding: [0x07,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v8 ; encoding: [0x08,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v10 ; encoding: [0x0a,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v11 ; encoding: [0x0b,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v7, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x07,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -408,19 +423,22 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspa
 ;
 ; GFX6789-LABEL: load_3d_tfe_lwe:
 ; GFX6789:       ; %bb.0: ; %main_body
-; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6789-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX6789-NEXT:    v_mov_b32_e32 v6, v1
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v11, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v12, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v10
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v11
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX6789-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v8, v4, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -428,29 +446,31 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspa
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
 ; NOPRT-NEXT:    image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe lwe
-; NOPRT-NEXT:    v_mov_b32_e32 v5, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v6, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[5:6], v4, off
+; NOPRT-NEXT:    global_store_dword v5, v4, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_3d_tfe_lwe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -536,19 +556,22 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace
 ;
 ; GFX6789-LABEL: load_cube_lwe:
 ; GFX6789:       ; %bb.0: ; %main_body
-; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6789-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX6789-NEXT:    v_mov_b32_e32 v6, v1
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v11, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v12, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v10
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v11
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX6789-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm lwe da
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v8, v4, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -556,29 +579,31 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
 ; NOPRT-NEXT:    image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm lwe da
-; NOPRT-NEXT:    v_mov_b32_e32 v5, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v6, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[5:6], v4, off
+; NOPRT-NEXT:    global_store_dword v5, v4, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_cube_lwe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -662,18 +687,21 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrsp
 ;
 ; GFX6789-LABEL: load_1darray_tfe:
 ; GFX6789:       ; %bb.0: ; %main_body
-; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6789-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v6, v1
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v8, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v11, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v10
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v11
 ; GFX6789-NEXT:    image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe da
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v7, v4, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -681,28 +709,30 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrsp
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
 ; NOPRT-NEXT:    image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe da
-; NOPRT-NEXT:    v_mov_b32_e32 v5, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v6, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[5:6], v4, off
+; NOPRT-NEXT:    global_store_dword v5, v4, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_1darray_tfe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0 ; encoding: [0x80,0x02,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v7 ; encoding: [0x07,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, v7 ; encoding: [0x07,0x03,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v7 ; encoding: [0x07,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v8 ; encoding: [0x08,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v10 ; encoding: [0x0a,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v11 ; encoding: [0x0b,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; encoding: [0x20,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v7, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x07,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -788,19 +818,22 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrsp
 ;
 ; GFX6789-LABEL: load_2darray_lwe:
 ; GFX6789:       ; %bb.0: ; %main_body
-; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6789-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX6789-NEXT:    v_mov_b32_e32 v6, v1
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v11, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v12, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v10
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v11
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX6789-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm lwe da
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v8, v4, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -808,29 +841,31 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrsp
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
 ; NOPRT-NEXT:    image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm lwe da
-; NOPRT-NEXT:    v_mov_b32_e32 v5, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v6, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[5:6], v4, off
+; NOPRT-NEXT:    global_store_dword v5, v4, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_2darray_lwe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -916,19 +951,22 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrsp
 ;
 ; GFX6789-LABEL: load_2dmsaa_both:
 ; GFX6789:       ; %bb.0: ; %main_body
-; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6789-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX6789-NEXT:    v_mov_b32_e32 v6, v1
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v11, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v12, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v10
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v11
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX6789-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v8, v4, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -936,29 +974,31 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrsp
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
 ; NOPRT-NEXT:    image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe lwe
-; NOPRT-NEXT:    v_mov_b32_e32 v5, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v6, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[5:6], v4, off
+; NOPRT-NEXT:    global_store_dword v5, v4, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_2dmsaa_both:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1046,20 +1086,23 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 ad
 ;
 ; GFX6789-LABEL: load_2darraymsaa_tfe:
 ; GFX6789:       ; %bb.0: ; %main_body
-; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6789-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v8, v3
 ; GFX6789-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX6789-NEXT:    v_mov_b32_e32 v6, v1
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v11, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v12, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v13, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v10
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v11
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v12
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v13
 ; GFX6789-NEXT:    image_load v[0:4], v[5:8], s[0:7] dmask:0xf unorm tfe da
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v9, v4, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -1067,30 +1110,32 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 ad
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
 ; NOPRT-NEXT:    image_load v[0:4], v[0:3], s[0:7] dmask:0xf unorm tfe da
-; NOPRT-NEXT:    v_mov_b32_e32 v5, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v6, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[5:6], v4, off
+; NOPRT-NEXT:    global_store_dword v5, v4, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_2darraymsaa_tfe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v3 ; encoding: [0x03,0x03,0x10,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v9 ; encoding: [0x09,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v12, v9 ; encoding: [0x09,0x03,0x18,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v13, v9 ; encoding: [0x09,0x03,0x1a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v9 ; encoding: [0x09,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v10 ; encoding: [0x0a,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v11 ; encoding: [0x0b,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v12 ; encoding: [0x0c,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x38,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v9, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x09,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1174,18 +1219,21 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspa
 ;
 ; GFX6789-LABEL: load_mip_1d_lwe:
 ; GFX6789:       ; %bb.0: ; %main_body
-; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6789-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v6, v1
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v8, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v11, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v10
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v11
 ; GFX6789-NEXT:    image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf unorm lwe
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v7, v4, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -1193,28 +1241,30 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspa
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
 ; NOPRT-NEXT:    image_load_mip v[0:4], v[0:1], s[0:7] dmask:0xf unorm lwe
-; NOPRT-NEXT:    v_mov_b32_e32 v5, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v6, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[5:6], v4, off
+; NOPRT-NEXT:    global_store_dword v5, v4, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_mip_1d_lwe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0 ; encoding: [0x80,0x02,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v7 ; encoding: [0x07,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, v7 ; encoding: [0x07,0x03,0x16,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v7 ; encoding: [0x07,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v8 ; encoding: [0x08,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v10 ; encoding: [0x0a,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v11 ; encoding: [0x0b,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x06,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v7, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x07,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1300,19 +1350,22 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspa
 ;
 ; GFX6789-LABEL: load_mip_2d_tfe:
 ; GFX6789:       ; %bb.0: ; %main_body
-; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6789-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v7, v2
 ; GFX6789-NEXT:    v_mov_b32_e32 v6, v1
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v11, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v12, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v10
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v11
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX6789-NEXT:    image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v8, v4, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -1320,29 +1373,31 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspa
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
 ; NOPRT-NEXT:    image_load_mip v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe
-; NOPRT-NEXT:    v_mov_b32_e32 v5, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v6, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v5, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[5:6], v4, off
+; NOPRT-NEXT:    global_store_dword v5, v4, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_mip_2d_tfe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1698,16 +1753,18 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 a
 ;
 ; GFX6789-LABEL: load_1d_tfe_V4_dmask3:
 ; GFX6789:       ; %bb.0: ; %main_body
+; GFX6789-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v6, v5
+; GFX6789-NEXT:    v_mov_b32_e32 v7, v5
+; GFX6789-NEXT:    v_mov_b32_e32 v8, v5
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v5
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX6789-NEXT:    image_load v[0:3], v4, s[0:7] dmask:0x7 unorm tfe
-; GFX6789-NEXT:    v_mov_b32_e32 v4, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[4:5], v3, off
+; GFX6789-NEXT:    global_store_dword v5, v3, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -1715,26 +1772,27 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 a
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v3, 0
 ; NOPRT-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe
-; NOPRT-NEXT:    v_mov_b32_e32 v4, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v5, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[4:5], v3, off
+; NOPRT-NEXT:    global_store_dword v4, v3, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_1d_tfe_V4_dmask3:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s9 ; encoding: [0x09,0x02,0x0a,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v6, v5 ; encoding: [0x05,0x03,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5 ; encoding: [0x05,0x03,0x0e,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, v5 ; encoding: [0x05,0x03,0x10,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5 ; encoding: [0x05,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v6 ; encoding: [0x06,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v7 ; encoding: [0x07,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v8 ; encoding: [0x08,0x03,0x06,0x7e]
 ; GFX10-NEXT:    image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x17,0x01,0xf0,0x04,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v4, s8 ; encoding: [0x08,0x02,0x08,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[4:5], v3, off ; encoding: [0x00,0x80,0x70,0xdc,0x04,0x03,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v5, v3, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x03,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1776,15 +1834,16 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 a
 ;
 ; GFX6789-LABEL: load_1d_tfe_V4_dmask2:
 ; GFX6789:       ; %bb.0: ; %main_body
+; GFX6789-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v5, v4
+; GFX6789-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v4
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v5
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX6789-NEXT:    image_load v[0:2], v3, s[0:7] dmask:0x6 unorm tfe
-; GFX6789-NEXT:    v_mov_b32_e32 v3, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[3:4], v2, off
+; GFX6789-NEXT:    global_store_dword v4, v2, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -1792,25 +1851,25 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 a
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v2, 0
 ; NOPRT-NEXT:    image_load v[0:2], v0, s[0:7] dmask:0x6 unorm tfe
-; NOPRT-NEXT:    v_mov_b32_e32 v3, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v4, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v3, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[3:4], v2, off
+; NOPRT-NEXT:    global_store_dword v3, v2, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_1d_tfe_V4_dmask2:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0 ; encoding: [0x80,0x02,0x08,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, s9 ; encoding: [0x09,0x02,0x08,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, v4 ; encoding: [0x04,0x03,0x0a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4 ; encoding: [0x04,0x03,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4 ; encoding: [0x04,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5 ; encoding: [0x05,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v6 ; encoding: [0x06,0x03,0x04,0x7e]
 ; GFX10-NEXT:    image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x16,0x01,0xf0,0x03,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s8 ; encoding: [0x08,0x02,0x06,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[3:4], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x02,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v4, v2, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x04,0x02,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1850,14 +1909,14 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 a
 ;
 ; GFX6789-LABEL: load_1d_tfe_V4_dmask1:
 ; GFX6789:       ; %bb.0: ; %main_body
+; GFX6789-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v3
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6789-NEXT:    image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe
-; GFX6789-NEXT:    v_mov_b32_e32 v2, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[2:3], v1, off
+; GFX6789-NEXT:    global_store_dword v3, v1, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -1865,24 +1924,23 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 a
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v1, 0
 ; NOPRT-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x8 unorm tfe
-; NOPRT-NEXT:    v_mov_b32_e32 v2, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v3, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v2, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[2:3], v1, off
+; NOPRT-NEXT:    global_store_dword v2, v1, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_1d_tfe_V4_dmask1:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s9 ; encoding: [0x09,0x02,0x06,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4 ; encoding: [0x04,0x03,0x02,0x7e]
 ; GFX10-NEXT:    image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x18,0x01,0xf0,0x02,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s8 ; encoding: [0x08,0x02,0x04,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v3, v1, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x01,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1922,14 +1980,14 @@ define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 a
 ;
 ; GFX6789-LABEL: load_1d_tfe_V2_dmask1:
 ; GFX6789:       ; %bb.0: ; %main_body
+; GFX6789-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v3
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6789-NEXT:    image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe
-; GFX6789-NEXT:    v_mov_b32_e32 v2, s8
-; GFX6789-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[2:3], v1, off
+; GFX6789-NEXT:    global_store_dword v3, v1, s[8:9]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -1937,24 +1995,23 @@ define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 a
 ; NOPRT:       ; %bb.0: ; %main_body
 ; NOPRT-NEXT:    v_mov_b32_e32 v1, 0
 ; NOPRT-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x8 unorm tfe
-; NOPRT-NEXT:    v_mov_b32_e32 v2, s8
-; NOPRT-NEXT:    v_mov_b32_e32 v3, s9
+; NOPRT-NEXT:    v_mov_b32_e32 v2, 0
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    global_store_dword v[2:3], v1, off
+; NOPRT-NEXT:    global_store_dword v2, v1, s[8:9]
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_1d_tfe_V2_dmask1:
 ; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s9 ; encoding: [0x09,0x02,0x06,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4 ; encoding: [0x04,0x03,0x02,0x7e]
 ; GFX10-NEXT:    image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x18,0x01,0xf0,0x02,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s8 ; encoding: [0x08,0x02,0x04,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v3, v1, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x01,0x08,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
index fa744b880974..d09ecc8917b3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -83,15 +83,15 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX9-NEXT:    s_wqm_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v4
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
-; GFX9-NEXT:    v_mov_b32_e32 v0, s12
-; GFX9-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[0:1], v3, off
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    global_store_dword v4, v3, s[12:13]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -99,16 +99,16 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_mov_b32 s28, exec_lo
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v4
+; GFX10-NEXT:    v_mov_b32_e32 v2, v4
+; GFX10-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s28
 ; GFX10-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
-; GFX10-NEXT:    v_mov_b32_e32 v0, s12
-; GFX10-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[0:1], v3, off
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    global_store_dword v4, v3, s[12:13]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index 02f57ff00e15..56a2165723e2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -60,18 +60,21 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX6789:       ; %bb.0: ; %main_body
 ; GFX6789-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6789-NEXT:    s_wqm_b64 exec, exec
+; GFX6789-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v7, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v8, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX6789-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX6789-NEXT:    image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s12
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v6, v4, s[12:13]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -79,19 +82,22 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe]
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe]
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
 ; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87]
 ; GFX10-NEXT:    image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s12 ; encoding: [0x0c,0x02,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s13 ; encoding: [0x0d,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -482,18 +488,21 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX6789:       ; %bb.0: ; %main_body
 ; GFX6789-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6789-NEXT:    s_wqm_b64 exec, exec
+; GFX6789-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX6789-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v0, 0
-; GFX6789-NEXT:    v_mov_b32_e32 v1, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v2, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v3, v0
-; GFX6789-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6789-NEXT:    v_mov_b32_e32 v7, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v8, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v0, v6
+; GFX6789-NEXT:    v_mov_b32_e32 v1, v7
+; GFX6789-NEXT:    v_mov_b32_e32 v2, v8
+; GFX6789-NEXT:    v_mov_b32_e32 v3, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX6789-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; GFX6789-NEXT:    image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe
-; GFX6789-NEXT:    v_mov_b32_e32 v5, s12
-; GFX6789-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[5:6], v4, off
+; GFX6789-NEXT:    global_store_dword v6, v4, s[12:13]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
@@ -501,19 +510,22 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe]
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe]
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
 ; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87]
 ; GFX10-NEXT:    image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s12 ; encoding: [0x0c,0x02,0x0a,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s13 ; encoding: [0x0d,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1767,29 +1779,29 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
 ;
 ; GFX6789-LABEL: sample_c_d_o_2darray_V1_tfe:
 ; GFX6789:       ; %bb.0: ; %main_body
-; GFX6789-NEXT:    v_mov_b32_e32 v9, 0
-; GFX6789-NEXT:    v_mov_b32_e32 v10, v9
+; GFX6789-NEXT:    v_mov_b32_e32 v11, 0
+; GFX6789-NEXT:    v_mov_b32_e32 v12, v11
+; GFX6789-NEXT:    v_mov_b32_e32 v9, v11
+; GFX6789-NEXT:    v_mov_b32_e32 v10, v12
 ; GFX6789-NEXT:    image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da
-; GFX6789-NEXT:    v_mov_b32_e32 v0, s12
-; GFX6789-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
-; GFX6789-NEXT:    global_store_dword v[0:1], v10, off
 ; GFX6789-NEXT:    v_mov_b32_e32 v0, v9
+; GFX6789-NEXT:    global_store_dword v11, v10, s[12:13]
 ; GFX6789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6789-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: sample_c_d_o_2darray_V1_tfe:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v1 ; encoding: [0x01,0x03,0x12,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
 ; GFX10-NEXT:    image_sample_c_d_o v[0:1], [v10, v9, v2, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x04,0xe9,0xf0,0x0a,0x00,0x40,0x00,0x09,0x02,0x03,0x04,0x05,0x06,0x07,0x08]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s12 ; encoding: [0x0c,0x02,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s13 ; encoding: [0x0d,0x02,0x06,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v11, v1, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x01,0x0c,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 25742666a579..ddb3f3f3c42c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -1287,8 +1287,41 @@ bb:
 ; GCN: v_accvgpr_read_b32
 ; GCN: v_accvgpr_read_b32
 ; GCN: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg, i64 %idx) {
 bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
+  ;store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %gep
+  ret void
+}
+
+; FIXME: Resulting code for splat is pretty bad. A v_mov_b32 is moved
+; in the middle of the expanded agpr reg_sequence. The broadcast of
+; the individual AGPR->AGPR components should avoid the intermediate AGPR case.
+; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code:
+; GCN: v_mov_b32_e32 [[VTMP0:v[0-9]+]], 0x42f60000
+; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[VTMP0]]
+; GCN: s_nop 0
+; GCN: v_accvgpr_read_b32 [[VTMP1:v[0-9]+]], [[AGPR]]
+; GCN: v_accvgpr_read_b32 [[VTMP2:v[0-9]+]], [[AGPR]]
+; GCN: v_accvgpr_read_b32 [[VTMP3:v[0-9]+]], [[AGPR]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP3]]
+; GCN: s_nop 0
+; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
+; GCN: v_accvgpr_read_b32
+; GCN: v_accvgpr_read_b32
+; GCN: v_accvgpr_read_b32
+; GCN: v_accvgpr_read_b32
+; GCN: global_store_dwordx4
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(<4 x float> addrspace(1)* %arg) {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
+
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
   store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll
index 7b1fb0110e89..111fd35e1ce4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll
@@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.s.get.waveid.in.workgroup() #0
 ; GFX10: s_get_waveid_in_workgroup [[DEST:s[0-9]+]]
 ; GFX10: s_waitcnt lgkmcnt(0)
 ; GFX10: v_mov_b32_e32 [[VDEST:v[0-9]+]], [[DEST]]
-; GFX10: global_store_dword v[{{[0-9:]+}}], [[VDEST]], off
+; GFX10: global_store_dword v{{[0-9]+}}, [[VDEST]], s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @test_s_get_waveid_in_workgroup(i32 addrspace(1)* %out) {
 ; Make sure %out is loaded and associated wait count already inserted
   store i32 0, i32 addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index c97c43ab5f04..0ae7d45454ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -15,7 +15,7 @@
 
 ; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 32
 ; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 64
-; GCN:       store_dword v[{{[0-9:]+}}], [[V]]
+; GCN:       store_dword v{{.+}}, [[V]]
 
 ; OPT-W32:   store i32 32, i32 addrspace(1)* %arg, align 4
 ; OPT-W64:   store i32 64, i32 addrspace(1)* %arg, align 4
@@ -36,7 +36,7 @@ bb:
 ; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
 ; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
 ; GCN-NOT:   cndmask
-; GCN:       store_dword v[{{[0-9:]+}}], [[V]]
+; GCN:       store_dword v{{.+}}, [[V]]
 
 ; OPT-W32:   store i32 1, i32 addrspace(1)* %arg, align 4
 ; OPT-W64:   store i32 2, i32 addrspace(1)* %arg, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index e251985141d3..45bacd48ff5a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -45,16 +45,13 @@ define amdgpu_kernel void @cos_f16(half addrspace(1)* %r, half addrspace(1)* %a)
 ; GFX9-LABEL: cos_f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX9-NEXT:    v_cos_f16_e32 v2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX9-NEXT:    v_cos_f16_e32 v1, v1
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %a.val = load half, half addrspace(1)* %a
   %r.val = call half @llvm.cos.f16(half %a.val)
@@ -118,21 +115,18 @@ define amdgpu_kernel void @cos_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add
 ; GFX9-LABEL: cos_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3118
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_f16_e32 v2, 0.15915494, v0
-; GFX9-NEXT:    v_cos_f16_e32 v2, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_cos_f16_e32 v3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_cos_f16_e32 v3, v3
+; GFX9-NEXT:    v_cos_f16_e32 v1, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index e4de730d993b..6cd0c2162f22 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -45,16 +45,13 @@ define amdgpu_kernel void @sin_f16(half addrspace(1)* %r, half addrspace(1)* %a)
 ; GFX9-LABEL: sin_f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX9-NEXT:    v_sin_f16_e32 v2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX9-NEXT:    v_sin_f16_e32 v1, v1
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %a.val = load half, half addrspace(1)* %a
   %r.val = call half @llvm.sin.f16(half %a.val)
@@ -118,21 +115,18 @@ define amdgpu_kernel void @sin_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add
 ; GFX9-LABEL: sin_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3118
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_f16_e32 v2, 0.15915494, v0
-; GFX9-NEXT:    v_sin_f16_e32 v2, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_sin_f16_e32 v3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_sin_f16_e32 v3, v3
+; GFX9-NEXT:    v_sin_f16_e32 v1, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val)

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 0a60413494af..eaae56fe04f6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -97,7 +97,7 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 
 ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
-; GCN-HSA: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
+; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]]
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
 define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
@@ -112,7 +112,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i3
 ; GCN-HSA: {{flat|global}}_load_dword v[[LO:[0-9]+]]
 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-; GCN-HSA: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 
 
 ; EG: MEM_RAT
@@ -144,7 +144,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)
 ; GCN-HSA: {{flat|global}}_load_dword v[[LO:[0-9]+]]
 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-; GCN-HSA: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
   %ext = sext <1 x i32> %ld to <1 x i64>

diff  --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index 94ea5875f84e..2b08a9d8bd29 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -46,12 +46,11 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8
 ; MUBUF-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
 ; MUBUF-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v3
-; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
-; MUBUF-NEXT:    v_mov_b32_e32 v2, s4
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; MUBUF-NEXT:    v_mov_b32_e32 v3, s5
-; MUBUF-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
+; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; MUBUF-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; MUBUF-NEXT:    s_endpgm
 ;
 ; FLATSCR-LABEL: local_stack_offset_uses_sp:
@@ -79,10 +78,9 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
 ; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT:    v_mov_b32_e32 v3, s1
-; FLATSCR-NEXT:    v_mov_b32_e32 v2, s0
-; FLATSCR-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; FLATSCR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; FLATSCR-NEXT:    s_endpgm
 entry:
   %pin.low = alloca i32, align 8192, addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index d0d8788cde3b..71ec8792d813 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -11,12 +11,11 @@ define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x30
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_pk_lshrrev_b16 v2, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s0, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_lshr_v2i16:

diff  --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index e93ee2ceee63..7d165b09b3f6 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -11,7 +11,7 @@
 ; GFX8: v_mad_u16 v[[R:[0-9]+]], v[[A]], v[[B]], v[[C]]
 ; GFX9: v_mad_legacy_u16 v[[R:[0-9]+]], v[[A]], v[[B]], v[[C]]
 ; GFX10: v_mad_u16 v[[R:[0-9]+]], v[[A]], v[[B]], v[[C]]
-; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[R]]
+; GCN: {{flat|global}}_store_short v{{.+}}, v[[R]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @mad_u16(
     i16 addrspace(1)* %r,

diff  --git a/llvm/test/CodeGen/AMDGPU/mai-inline.ll b/llvm/test/CodeGen/AMDGPU/mai-inline.ll
index a9d8c5cb2e07..8b7f542118d9 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/mai-inline.ll
@@ -3,7 +3,7 @@
 ; GCN-LABEL: {{^}}accvgpr_write_read:
 ; GFX908: v_accvgpr_write [[AREG:a[0-9]+]], 1
 ; GFX908: v_accvgpr_read [[VREG:v[0-9]+]], [[AREG]]
-; GFX908: global_store_dword {{[^,]+}}, [[VREG]], off
+; GFX908: global_store_dword v{{[0-9]+}}, [[VREG]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @accvgpr_write_read(float addrspace(1)* %arg) {
 bb:
   %in.1 = load float, float addrspace(1)* %arg

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
index f6ee09fc609a..5e2e2df1effc 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
@@ -347,8 +347,8 @@ entry:
 
 ; GCN-LABEL: {{^}}nontemporal_global_0:
 ; GFX8:  flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
-; GFX9:  global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}}
-; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off slc{{$}}
+; GFX9:  global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} glc slc{{$}}
+; GFX10: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} slc{{$}}
 ; GFX10:         .amdhsa_kernel nontemporal_global_0
 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index ab387361c05a..2c5931ef57b6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -51,17 +51,16 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocaptu
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 ; GCN-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
+; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[16:17], 0x0
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[16:17], 0x10
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[16:17], 0x20
 ; GCN-NEXT:    s_load_dwordx4 s[12:15], s[16:17], 0x30
-; GCN-NEXT:    v_mov_b32_e32 v12, s18
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v4, s4
 ; GCN-NEXT:    v_mov_b32_e32 v8, s8
-; GCN-NEXT:    v_mov_b32_e32 v13, s19
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
@@ -71,14 +70,14 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocaptu
 ; GCN-NEXT:    v_mov_b32_e32 v9, s9
 ; GCN-NEXT:    v_mov_b32_e32 v10, s10
 ; GCN-NEXT:    v_mov_b32_e32 v11, s11
-; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off
-; GCN-NEXT:    global_store_dwordx4 v[12:13], v[4:7], off offset:16
-; GCN-NEXT:    global_store_dwordx4 v[12:13], v[8:11], off offset:32
+; GCN-NEXT:    global_store_dwordx4 v12, v[0:3], s[18:19]
+; GCN-NEXT:    global_store_dwordx4 v12, v[4:7], s[18:19] offset:16
+; GCN-NEXT:    global_store_dwordx4 v12, v[8:11], s[18:19] offset:32
 ; GCN-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-NEXT:    v_mov_b32_e32 v2, s14
 ; GCN-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off offset:48
+; GCN-NEXT:    global_store_dwordx4 v12, v[0:3], s[18:19] offset:48
 ; GCN-NEXT:    s_endpgm
 bb:
   %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16
@@ -179,12 +178,11 @@ define amdgpu_kernel void @vector_clause_indirect(i64 addrspace(1)* noalias noca
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v[8:9], off
 ; GCN-NEXT:    global_load_dwordx4 v[4:7], v[8:9], off offset:16
-; GCN-NEXT:    v_mov_b32_e32 v9, s5
-; GCN-NEXT:    v_mov_b32_e32 v8, s4
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GCN-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GCN-NEXT:    global_store_dwordx4 v8, v[4:7], s[4:5] offset:16
 ; GCN-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 0457030a0455..b5f2ee49441e 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -30,23 +30,21 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; MUBUF-NEXT:    s_cbranch_scc1 BB0_3
 ; MUBUF-NEXT:  ; %bb.2: ; %bb.1
 ; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
-; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
 ; MUBUF-NEXT:    s_lshl_b32 s7, s10, 2
 ; MUBUF-NEXT:    s_mov_b32 s32, s6
-; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 1
+; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
+; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
 ; MUBUF-NEXT:    s_add_i32 s6, s6, s7
-; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
-; MUBUF-NEXT:    v_mov_b32_e32 v1, s6
-; MUBUF-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen
+; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
+; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    v_add_u32_e32 v2, v1, v0
+; MUBUF-NEXT:    v_add_u32_e32 v0, v2, v0
 ; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
-; MUBUF-NEXT:    v_mov_b32_e32 v0, s4
-; MUBUF-NEXT:    v_mov_b32_e32 v1, s5
-; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
+; MUBUF-NEXT:    global_store_dword v1, v0, s[4:5]
 ; MUBUF-NEXT:  BB0_3: ; %bb.2
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
 ; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
@@ -76,14 +74,12 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; FLATSCR-NEXT:    s_lshl_b32 s2, s6, 2
 ; FLATSCR-NEXT:    s_mov_b32 s32, s4
 ; FLATSCR-NEXT:    s_add_i32 s4, s4, s2
-; FLATSCR-NEXT:    scratch_load_dword v1, off, s4
+; FLATSCR-NEXT:    scratch_load_dword v2, off, s4
 ; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    v_add_u32_e32 v2, v1, v0
+; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
 ; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
-; FLATSCR-NEXT:    v_mov_b32_e32 v1, s1
-; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
 ; FLATSCR-NEXT:  BB0_3: ; %bb.2
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
@@ -137,23 +133,21 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; MUBUF-NEXT:  ; %bb.1: ; %bb.0
 ; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
 ; MUBUF-NEXT:    s_and_b32 s6, s6, 0xfffff000
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
-; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
 ; MUBUF-NEXT:    s_lshl_b32 s7, s7, 2
 ; MUBUF-NEXT:    s_mov_b32 s32, s6
-; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 1
+; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
+; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
 ; MUBUF-NEXT:    s_add_i32 s6, s6, s7
-; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
-; MUBUF-NEXT:    v_mov_b32_e32 v1, s6
-; MUBUF-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen
+; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
+; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    v_add_u32_e32 v2, v1, v0
+; MUBUF-NEXT:    v_add_u32_e32 v0, v2, v0
 ; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
-; MUBUF-NEXT:    v_mov_b32_e32 v0, s4
-; MUBUF-NEXT:    v_mov_b32_e32 v1, s5
-; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
+; MUBUF-NEXT:    global_store_dword v1, v0, s[4:5]
 ; MUBUF-NEXT:  BB1_2: ; %bb.1
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
 ; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
@@ -178,14 +172,12 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; FLATSCR-NEXT:    s_mov_b32 s32, s2
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s2
 ; FLATSCR-NEXT:    s_add_i32 s2, s2, s3
-; FLATSCR-NEXT:    scratch_load_dword v1, off, s2
+; FLATSCR-NEXT:    scratch_load_dword v2, off, s2
 ; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    v_add_u32_e32 v2, v1, v0
+; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
 ; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
-; FLATSCR-NEXT:    v_mov_b32_e32 v1, s1
-; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
 ; FLATSCR-NEXT:  BB1_2: ; %bb.1
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off

diff  --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 3b0795d25a14..27d01efd54a2 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -632,10 +632,9 @@ define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) {
 ; GFX9-LABEL: global_inst_salu_offset_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_endpgm
@@ -643,11 +642,10 @@ define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) {
 ; GFX10-LABEL: global_inst_salu_offset_1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
@@ -661,10 +659,9 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p
 ; GFX9-LABEL: global_inst_salu_offset_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_endpgm
@@ -672,11 +669,10 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p
 ; GFX10-LABEL: global_inst_salu_offset_11bit_max:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
@@ -690,10 +686,9 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p
 ; GFX9-LABEL: global_inst_salu_offset_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_endpgm
@@ -745,10 +740,9 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1)
 ; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-2048
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_endpgm
@@ -756,11 +750,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1)
 ; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-2048
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
@@ -774,10 +767,9 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1)
 ; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-4096
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_endpgm
@@ -834,10 +826,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)*
 ; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_endpgm
@@ -916,10 +907,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace
 ; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-4096
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX9-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
index da52bcee3637..ca41899b055e 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -9,7 +9,7 @@
 ; GCN: v_cndmask_b32
 ; GCN: v_cndmask_b32
 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
-; GCN: store_dword v[{{[0-9:]+}}], [[RES]]
+; GCN: store_dword v{{.+}}, [[RES]]
 
 ; OPT:  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
 ; OPT:  store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> addrspace(5)* %alloca, align 4
@@ -44,7 +44,7 @@ entry:
 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
-; GCN:     store_dwordx4 v[{{[0-9:]+}}],
+; GCN:     store_dwordx4 v{{.+}},
 
 ; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
 ; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca

diff  --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
index 1b709dc241a0..a9182aaf1189 100644
--- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -17,10 +17,10 @@ declare i64 @llvm.readcyclecounter() #0
 ; GETREG-DAG:  v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
 ; GETREG-DAG:  s_getreg_b32 [[CNT1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES, 0, 20)
 ; GETREG-DAG:  v_mov_b32_e32 v[[VCNT1:[0-9]+]], [[CNT1]]
-; GETREG:      global_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[VCNT1]]:[[ZERO]]], off
+; GETREG:      global_store_dwordx2 v{{.+}}, v{{\[}}[[VCNT1]]:[[ZERO]]]
 ; GETREG:      s_getreg_b32 [[CNT2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES, 0, 20)
 ; GETREG:      v_mov_b32_e32 v[[VCNT2:[0-9]+]], [[CNT2]]
-; GETREG:      global_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[VCNT2]]:[[ZERO]]], off
+; GETREG:      global_store_dwordx2 v{{.+}}, v{{\[}}[[VCNT2]]:[[ZERO]]]
 
 define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
   %cycle0 = call i64 @llvm.readcyclecounter()

diff  --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index 50ac22851af8..d01dbbfce000 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -59,21 +59,20 @@ define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    s_add_u32 s2, s6, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_addc_u32 s3, s7, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[8:9], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
@@ -134,19 +133,16 @@ define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v4, s1
-; GFX9-NEXT:    v_add_i32 v4, s0, v4 clamp
-; GFX9-NEXT:    s_add_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v5, s0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, s0, v4
-; GFX9-NEXT:    global_store_dword v[0:1], v5, off
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v[2:3], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_add_i32 s1, s0, s1
+; GFX9-NEXT:    v_add_i32 v1, s0, v1 clamp
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %sadd, 0
@@ -214,24 +210,17 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 ; GFX9-LABEL: v_saddo_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off
-; GFX9-NEXT:    global_load_dword v5, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_i32 v6, v4, v5 clamp
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v5
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v6
-; GFX9-NEXT:    global_store_dword v[0:1], v4, off
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v[2:3], v0, off
+; GFX9-NEXT:    v_add_i32 v3, v1, v2 clamp
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
@@ -296,23 +285,20 @@ define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ; GFX9-LABEL: s_saddo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NEXT:    s_add_u32 s0, s4, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_addc_u32 s1, s5, s7
-; GFX9-NEXT:    v_mov_b32_e32 v5, s5
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s0
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
-; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
+; GFX9-NEXT:    s_add_u32 s8, s4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_addc_u32 s9, s5, s7
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT:    global_store_byte v[2:3], v0, off
+; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
@@ -381,27 +367,20 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ;
 ; GFX9-LABEL: v_saddo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s2
-; GFX9-NEXT:    v_mov_b32_e32 v7, s3
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[8:9]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[8:9], off
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[4:5]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT:    global_store_byte v[6:7], v0, off
+; GFX9-NEXT:    global_store_byte v6, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %a = load i64, i64 addrspace(1)* %aptr, align 4
   %b = load i64, i64 addrspace(1)* %bptr, align 4
@@ -481,28 +460,21 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
 ; GFX9-LABEL: v_saddo_v2i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s2
-; GFX9-NEXT:    v_mov_b32_e32 v7, s3
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_i32 v8, v0, v2 clamp
+; GFX9-NEXT:    v_add_i32 v5, v0, v2 clamp
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add_i32 v2, v1, v3 clamp
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v8
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index e089ac0afc16..c91a54cd8c78 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -165,8 +165,8 @@ define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace
 ; CI: buffer_load_dword
 ; CI: buffer_store_dword
 
-; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4
-; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12
+; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
+; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
 ; GFX9: ds_write_b32
 define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
   %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
@@ -216,11 +216,11 @@ define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %ou
 ; CI: buffer_store_dword
 ; CI: s_endpgm
 
-; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:400
-; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:408
-; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:12
-; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:400
-; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:408
+; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:400
+; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:408
+; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
+; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:400
+; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:408
 ; GFX9: global_store_dword
 ; GFX9: s_endpgm
 define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {

diff  --git a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
index c376886a3e80..2e0633d6b70b 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
@@ -3,7 +3,7 @@
 ; GCN-LABEL: ; %bb.0:
 ; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0
 ; GCN: s_waitcnt lgkmcnt(0)
-; GCN: global_store_dword v{{\[}}[[ADDR_LO]]{{\:}}[[ADDR_HI]]{{\]}}, v{{[0-9]+}}, off
+; GCN: global_store_dword v
 
 define amdgpu_kernel void @zot(i32 addrspace(1)* nocapture %arg, i64 addrspace(1)* nocapture %arg1) {
 bb:

diff  --git a/llvm/test/CodeGen/AMDGPU/store-global.ll b/llvm/test/CodeGen/AMDGPU/store-global.ll
index a8843365421d..6085e1a88549 100644
--- a/llvm/test/CodeGen/AMDGPU/store-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-global.ll
@@ -65,7 +65,7 @@ entry:
 ; SIVI-DAG: buffer_store_byte
 ; SIVI-DAG: buffer_store_short
 
-; GFX9-DAG: global_store_byte_d16_hi v{{\[[0-9]:[0-9]+\]}}, v{{[0-9]+}}, off offset:2
+; GFX9-DAG: global_store_byte_d16_hi v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:2
 ; GFX9-DAG: global_store_short
 
 ; EG: MEM_RAT MSKOR
@@ -80,7 +80,7 @@ entry:
 ; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
 ; GCN: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
 ; SIVI: buffer_store_dword [[VAND]]
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAND]]
+; GFX9: global_store_dword v{{[0-9]+}}, [[VAND]], s
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW
 ; EG-NOT: MEM_RAT

diff  --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 90336ca79ac2..f66cb6b7b6dc 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -75,24 +75,22 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
 ;
 ; GFX9-LABEL: local_store_i55:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:14
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s1, s[4:5], 0x8
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    s_and_b32 s3, s2, 0xffff
-; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    ds_write_b16 v1, v2 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v1, s3, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fffff, v1
-; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:6
-; GFX9-NEXT:    ds_write_b32 v0, v3
+; GFX9-NEXT:    v_or_b32_e32 v0, s3, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
+; GFX9-NEXT:    ds_write_b8_d16_hi v1, v0 offset:6
+; GFX9-NEXT:    ds_write_b32 v1, v3
 ; GFX9-NEXT:    s_endpgm
   store i55 %arg, i55 addrspace(3)* %ptr, align 8
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll
index 627ba9e0f717..85029a544383 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}trunc_store_v4i64_v4i8:
-; GCN: global_store_dword v{{\[[0-9]:[0-9]+\]}}, v{{[0-9]+}}, off
+; GCN: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @trunc_store_v4i64_v4i8(< 4 x i8> addrspace(1)* %out, <4 x i64> %in) {
 entry:
   %trunc = trunc <4 x i64> %in to < 4 x i8>
@@ -10,7 +10,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}trunc_store_v8i64_v8i8:
-; GCN: global_store_dwordx2 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
+; GCN: global_store_dwordx2 v{{[0-9]+}}, v{{\[[0-9]:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @trunc_store_v8i64_v8i8(< 8 x i8> addrspace(1)* %out, <8 x i64> %in) {
 entry:
   %trunc = trunc <8 x i64> %in to < 8 x i8>
@@ -19,7 +19,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}trunc_store_v8i64_v8i16:
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @trunc_store_v8i64_v8i16(< 8 x i16> addrspace(1)* %out, <8 x i64> %in) {
 entry:
   %trunc = trunc <8 x i64> %in to < 8 x i16>
@@ -28,8 +28,8 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}trunc_store_v8i64_v8i32:
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:16
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @trunc_store_v8i64_v8i32(< 8 x i32> addrspace(1)* %out, <8 x i64> %in) {
 entry:
   %trunc = trunc <8 x i64> %in to <8 x i32>
@@ -38,10 +38,10 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}trunc_store_v16i64_v16i32:
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:48
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:32
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:16
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:48
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @trunc_store_v16i64_v16i32(< 16 x i32> addrspace(1)* %out, <16 x i64> %in) {
 entry:
   %trunc = trunc <16 x i64> %in to <16 x i32>

diff  --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index bc8ccf9697ef..b3e158f90ae8 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -798,16 +798,15 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)
 ; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16
   %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

diff  --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 58737e6222f6..b5b42d893205 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -233,11 +233,11 @@ bb13:
 
 ; GFX1032: s_or_b32 [[MASK0:s[0-9]+]], [[MASK0]], vcc_lo
 ; GFX1064: s_or_b64 [[MASK0:s\[[0-9:]+\]]], [[MASK0]], vcc
+; GCN:     global_store_dword
 ; GFX1032: s_andn2_b32 [[MASK1:s[0-9]+]], [[MASK1]], exec_lo
 ; GFX1064: s_andn2_b64 [[MASK1:s\[[0-9:]+\]]], [[MASK1]], exec
 ; GFX1032: s_and_b32 [[MASK0]], [[MASK0]], exec_lo
 ; GFX1064: s_and_b64 [[MASK0]], [[MASK0]], exec
-; GCN:     global_store_dword
 ; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], [[MASK0]]
 ; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], [[MASK0]]
 ; GCN:   BB{{.*}}: ; %Flow
@@ -476,9 +476,12 @@ exit:
 }
 
 ; GCN-LABEL: {{^}}fdiv_f32:
+; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+
 ; GCN-NOT: vcc
 ; GCN: v_div_fmas_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
@@ -774,7 +777,7 @@ main_body:
 ; GFX1064:     v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
 ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
 ; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]]
-; GCN:         store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]],
+; GCN:         store_dwordx2 v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]], s
 define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src, float %a) {
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
@@ -789,7 +792,7 @@ define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src
 ; GFX1064:     v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], 0x64, {{s[0-9]+}}
 ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
 ; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]]
-; GCN:         store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]],
+; GCN:         store_dwordx2 v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]], s
 define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32)
   store i64 %result, i64 addrspace(1)* %out
@@ -801,7 +804,7 @@ define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src)
 ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
 ; GFX1064:     v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
 ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
-; GCN:         store_dword v[{{[0-9:]+}}], v[[V_LO]],
+; GCN:         store_dword v{{[0-9]+}}, v[[V_LO]], s
 define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src, float %a) {
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)
@@ -814,7 +817,7 @@ define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src
 ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}}
 ; GFX1064:     v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:{{[0-9]+}}], 0x64, {{s[0-9]+}}
 ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}}
-; GCN:         store_dword v[{{[0-9:]+}}], v[[V_LO]],
+; GCN:         store_dword v{{[0-9]+}}, v[[V_LO]], s
 define amdgpu_kernel void @test_intr_icmp_i32(i32 addrspace(1)* %out, i32 %src) {
   %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32)
   store i32 %result, i32 addrspace(1)* %out


        


More information about the llvm-commits mailing list