[llvm] 33fd4a1 - [AMDGPU/MemOpsCluster] Clean-up fixme's around mem ops clustering logic

via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 30 09:11:43 PDT 2020


Author: hsmahesha
Date: 2020-07-30T21:41:13+05:30
New Revision: 33fd4a18e7d373344c8af0012dd97c1c739f2916

URL: https://github.com/llvm/llvm-project/commit/33fd4a18e7d373344c8af0012dd97c1c739f2916
DIFF: https://github.com/llvm/llvm-project/commit/33fd4a18e7d373344c8af0012dd97c1c739f2916.diff

LOG: [AMDGPU/MemOpsCluster] Clean-up fixme's around mem ops clustering logic

Get rid of all the FIXMEs and base the heuristic on `num-clustered-dwords`. The main intuition behind this is as
follows. The existing heuristic can be roughly summarized as below:

* Assume that all the mem ops participating in the clustering process load/store the same number of bytes
* If each mem op loads 4 bytes, then cluster at most 5 mem ops, that is, at most 20 bytes
* If each mem op loads 8 bytes, then cluster at most 3 mem ops, that is, at most 24 bytes
* If each mem op loads 16 bytes, then cluster at most 2 mem ops, that is, at most 32 bytes

So, we need to make sure that the new heuristic does not completely deviate from the above one, and that it
properly handles both sub-word loads and wide loads (a small sketch of the new arithmetic follows below).
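
For illustration, the arithmetic behind the new heuristic can be sketched as a small standalone C++ program.
This is not the committed code; the helper name `maxClusterSize` is made up here, and only the 8-DWORD budget
and the round-up-to-DWORDs step are taken from the patch:

    #include <cstdio>

    // Each mem op in the cluster is assumed to load `LoadSize` bytes, and the
    // cluster is accepted only while the total DWORD count stays within 8.
    // Hypothetical helper (not part of SIInstrInfo): largest NumLoads for which
    // ((LoadSize + 3) / 4) * NumLoads <= 8 still holds.
    static unsigned maxClusterSize(unsigned LoadSizeInBytes) {
      const unsigned DWORDsPerOp = (LoadSizeInBytes + 3) / 4; // round up to DWORDs
      return DWORDsPerOp ? 8 / DWORDsPerOp : 0;
    }

    int main() {
      // 4-byte ops -> 8 (old limit: 5), 8-byte -> 4 (old: 3), 16-byte -> 2
      // (old: 2), and anything of 17 bytes or more -> 1, i.e. no clustering.
      const unsigned Sizes[] = {4, 8, 12, 16, 20};
      for (unsigned Size : Sizes)
        std::printf("LoadSize %2u -> at most %u clustered mem ops\n", Size,
                    maxClusterSize(Size));
      return 0;
    }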

Reviewed By: arsenm, rampitec

Differential Revision: https://reviews.llvm.org/D84354

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
    llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
    llvm/test/CodeGen/AMDGPU/call-argument-types.ll
    llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
    llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/fshr.ll
    llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
    llvm/test/CodeGen/AMDGPU/kernel-args.ll
    llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
    llvm/test/CodeGen/AMDGPU/merge-stores.ll
    llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
    llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
    llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll
    llvm/test/CodeGen/AMDGPU/udivrem.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 59915cb48324..d3dbf4737067 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -474,65 +474,29 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                       ArrayRef<const MachineOperand *> BaseOps2,
                                       unsigned NumLoads,
                                       unsigned NumBytes) const {
+  // If the mem ops (to be clustered) do not have the same base ptr, then they
+  // should not be clustered
   assert(!BaseOps1.empty() && !BaseOps2.empty());
   const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
   const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
-
   if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
     return false;
 
-  const MachineOperand *FirstDst = nullptr;
-  const MachineOperand *SecondDst = nullptr;
-
-  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
-      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
-      (isMIMG(FirstLdSt) && isMIMG(SecondLdSt)) ||
-      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
-    const unsigned MaxGlobalLoadCluster = 7;
-    if (NumLoads > MaxGlobalLoadCluster)
-      return false;
-
-    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
-    if (!FirstDst)
-      FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
-    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
-    if (!SecondDst)
-      SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
-  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
-    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
-    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
-  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
-    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
-    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
-  }
-
-  if (!FirstDst || !SecondDst)
-    return false;
-
-  // Try to limit clustering based on the total number of bytes loaded
-  // rather than the number of instructions.  This is done to help reduce
-  // register pressure.  The method used is somewhat inexact, though,
-  // because it assumes that all loads in the cluster will load the
-  // same number of bytes as FirstLdSt.
-
-  // The unit of this value is bytes.
-  // FIXME: This needs finer tuning.
-  unsigned LoadClusterThreshold = 16;
-
-  const MachineRegisterInfo &MRI =
-      FirstLdSt.getParent()->getParent()->getRegInfo();
-
-  const Register Reg = FirstDst->getReg();
-
-  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
-                                         ? MRI.getRegClass(Reg)
-                                         : RI.getPhysRegClass(Reg);
-
-  // FIXME: NumLoads should not be subtracted 1. This is to match behavior
-  // of clusterNeighboringMemOps which was previosly passing cluster length
-  // less 1. LoadClusterThreshold should be tuned instead.
-  return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
-         LoadClusterThreshold;
+  // In order to avoid register pressure, on average, the number of DWORDS
+  // loaded together by all clustered mem ops should not exceed 8. This is an
+  // empirical value based on certain observations and performance related
+  // experiments.
+  // The good thing about this heuristic is - it avoids clustering of too many
+  // sub-word loads, and also avoids clustering of wide loads. Below is the
+  // brief summary of how the heuristic behaves for various `LoadSize`.
+  // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
+  // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
+  // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
+  // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
+  // (5) LoadSize >= 17: do not cluster
+  const unsigned LoadSize = NumBytes / NumLoads;
+  const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
+  return NumDWORDs <= 8;
 }
 
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
index 6fc1cd575308..ab3fbc03e81d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
@@ -950,22 +950,22 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out)
 define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
 ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s8, s[2:3], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s8, s8, 63
-; GFX6-NEXT:    s_bfe_u32 s9, s8, 0x20002
-; GFX6-NEXT:    v_mov_b32_e32 v1, s9
-; GFX6-NEXT:    v_mov_b32_e32 v0, s8
-; GFX6-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX6-NEXT:    s_endpgm
+; GFX6-NEXT:        s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX6-NEXT:        s_load_dwordx2 s[8:9], s[0:1], 0xb
+; GFX6-NEXT:        s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX6-NEXT:        s_mov_b32 s6, -1
+; GFX6-NEXT:        s_mov_b32 s7, 0xf000
+; GFX6-NEXT:        s_mov_b64 s[10:11], s[6:7]
+; GFX6-NEXT:        s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:        s_load_dword s0, s[0:1], 0x0
+; GFX6-NEXT:        s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:        s_and_b32 s0, s0, 63
+; GFX6-NEXT:        s_bfe_u32 s1, s0, 0x20002
+; GFX6-NEXT:        v_mov_b32_e32 v1, s1
+; GFX6-NEXT:        v_mov_b32_e32 v0, s0
+; GFX6-NEXT:        buffer_store_dword v1, off, s[4:7], 0
+; GFX6-NEXT:        buffer_store_dword v0, off, s[8:11], 0
+; GFX6-NEXT:        s_endpgm
                                             i32 addrspace(1)* %out1,
                                             i32 addrspace(1)* %in) #0 {
   %src = load i32, i32 addrspace(1)* %in, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index af8b4f0f9e5a..6dceaf2e22fa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -17,57 +17,56 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) {
 ;
 ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
 ; GFX9-NOUNALIGNED:       ; %bb.0:
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_add_co_u32_e32 v2, vcc, 11, v0
-; GFX9-NOUNALIGNED-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v[2:3], off offset:-6
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v5, v[2:3], off offset:-5
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v6, v[2:3], off offset:-4
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v7, v[2:3], off offset:-3
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v8, v[2:3], off offset:-2
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v9, v[2:3], off offset:-1
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v1, v[2:3], off offset:-10
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v[2:3], off offset:-9
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v12, v[2:3], off offset:-8
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v2, v[2:3], off offset:-7
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v3, 0xff
-; GFX9-NOUNALIGNED-NEXT:    s_movk_i32 s4, 0xff
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v13, 8
-; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s5, 8
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(11)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v5, v5, v3
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v6, v6, v3
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v9, v9, v3
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v10, v10, v3
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v11, s4, v11
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v12, s4, v12
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v2, v2, v3, v4
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v0, s4, v1
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v11
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v3, v7, v3, v8
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v10
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v0, v1, v11
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v2, v4, v5
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v3, v6, v7
-; GFX9-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NOUNALIGNED-NEXT:	v_add_co_u32_e32 v2, vcc, 11, v0
+; GFX9-NOUNALIGNED-NEXT:	v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v0, v[0:1], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v1, v[2:3], off offset:-10
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v4, v[2:3], off offset:-9
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v5, v[2:3], off offset:-8
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v6, v[2:3], off offset:-7
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v7, v[2:3], off offset:-6
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v8, v[2:3], off offset:-5
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v9, v[2:3], off offset:-4
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v10, v[2:3], off offset:-3
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v11, v[2:3], off offset:-2
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v12, v[2:3], off offset:-1
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v2, v[2:3], off
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v3, 0xff
+; GFX9-NOUNALIGNED-NEXT:	s_movk_i32 s4, 0xff
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v13, 8
+; GFX9-NOUNALIGNED-NEXT:	s_mov_b32 s5, 8
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(10)
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(9)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v4, s4, v4
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(8)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v5, s4, v5
+; GFX9-NOUNALIGNED-NEXT:	v_and_or_b32 v0, v0, s4, v1
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(6)
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(5)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v8, v8, v3
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(4)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v9, v9, v3
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v1, 16, v4
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(2)
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_sdwa v11, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(1)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v12, v12, v3
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(0)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v2, v2, v3
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v4, 24, v5
+; GFX9-NOUNALIGNED-NEXT:	v_and_or_b32 v5, v6, v3, v7
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v6, 16, v8
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v7, 24, v9
+; GFX9-NOUNALIGNED-NEXT:	v_and_or_b32 v3, v10, v3, v11
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v8, 16, v12
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v2, 24, v2
+; GFX9-NOUNALIGNED-NEXT:	v_or3_b32 v0, v0, v1, v4
+; GFX9-NOUNALIGNED-NEXT:	v_or3_b32 v1, v5, v6, v7
+; GFX9-NOUNALIGNED-NEXT:	v_or3_b32 v2, v3, v8, v2
+; GFX9-NOUNALIGNED-NEXT:	s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
 ; GFX7-UNALIGNED:       ; %bb.0:
@@ -156,31 +155,30 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) {
 ;
 ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
 ; GFX9-NOUNALIGNED:       ; %bb.0:
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_add_co_u32_e32 v2, vcc, 10, v0
-; GFX9-NOUNALIGNED-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v4, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v1, v[2:3], off offset:-8
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v5, v[2:3], off offset:-6
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v6, v[2:3], off offset:-4
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v2, v[2:3], off offset:-2
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v3, 0xffff
-; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, v4, v3
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v6, v6, v3
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v0, s4, v1
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v1, v5, v3, v6
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v2, v2, v3, v4
-; GFX9-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NOUNALIGNED-NEXT:        v_add_co_u32_e32 v2, vcc, 10, v0
+; GFX9-NOUNALIGNED-NEXT:        v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NOUNALIGNED-NEXT:        global_load_ushort v0, v[0:1], off
+; GFX9-NOUNALIGNED-NEXT:        global_load_ushort v1, v[2:3], off offset:-8
+; GFX9-NOUNALIGNED-NEXT:        global_load_ushort v4, v[2:3], off offset:-6
+; GFX9-NOUNALIGNED-NEXT:        global_load_ushort v5, v[2:3], off offset:-4
+; GFX9-NOUNALIGNED-NEXT:        global_load_ushort v6, v[2:3], off offset:-2
+; GFX9-NOUNALIGNED-NEXT:        global_load_ushort v2, v[2:3], off
+; GFX9-NOUNALIGNED-NEXT:        v_mov_b32_e32 v3, 0xffff
+; GFX9-NOUNALIGNED-NEXT:        s_mov_b32 s4, 0xffff
+; GFX9-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(4)
+; GFX9-NOUNALIGNED-NEXT:        v_and_b32_e32 v1, s4, v1
+; GFX9-NOUNALIGNED-NEXT:        v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(2)
+; GFX9-NOUNALIGNED-NEXT:        v_and_b32_e32 v5, v5, v3
+; GFX9-NOUNALIGNED-NEXT:        v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(0)
+; GFX9-NOUNALIGNED-NEXT:        v_and_b32_e32 v2, v2, v3
+; GFX9-NOUNALIGNED-NEXT:        v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NOUNALIGNED-NEXT:        v_and_or_b32 v0, v0, s4, v1
+; GFX9-NOUNALIGNED-NEXT:        v_and_or_b32 v1, v4, v3, v5
+; GFX9-NOUNALIGNED-NEXT:        v_and_or_b32 v2, v6, v3, v2
+; GFX9-NOUNALIGNED-NEXT:        s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
@@ -194,35 +192,36 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) {
 ;
 ; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
 ; GFX7-NOUNALIGNED:       ; %bb.0:
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s6, 0
-; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NOUNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10
-; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2
-; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:4
-; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:6
-; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8
-; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s4, 0xffff
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s4, v3
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v3, s4, v4
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s4, v5
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v5, s4, v6
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v6, s4, v0
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, s4, v2
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v1, v2
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v4, v3
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v6, v5
-; GFX7-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NOUNALIGNED-NEXT:        s_mov_b32 s6, 0
+; GFX7-NOUNALIGNED-NEXT:        s_mov_b32 s7, 0xf000
+; GFX7-NOUNALIGNED-NEXT:        s_mov_b64 s[4:5], 0
+; GFX7-NOUNALIGNED-NEXT:        buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NOUNALIGNED-NEXT:        buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2
+; GFX7-NOUNALIGNED-NEXT:        buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4
+; GFX7-NOUNALIGNED-NEXT:        buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6
+; GFX7-NOUNALIGNED-NEXT:        buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX7-NOUNALIGNED-NEXT:        buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10
+; GFX7-NOUNALIGNED-NEXT:        s_mov_b32 s4, 0xffff
+; GFX7-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(5)
+; GFX7-NOUNALIGNED-NEXT:        v_and_b32_e32 v1, s4, v2
+; GFX7-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(4)
+; GFX7-NOUNALIGNED-NEXT:        v_and_b32_e32 v2, s4, v3
+; GFX7-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(3)
+; GFX7-NOUNALIGNED-NEXT:        v_and_b32_e32 v3, s4, v4
+; GFX7-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(2)
+; GFX7-NOUNALIGNED-NEXT:        v_and_b32_e32 v4, s4, v5
+; GFX7-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(1)
+; GFX7-NOUNALIGNED-NEXT:        v_and_b32_e32 v5, s4, v6
+; GFX7-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(0)
+; GFX7-NOUNALIGNED-NEXT:        v_and_b32_e32 v0, s4, v0
+; GFX7-NOUNALIGNED-NEXT:        v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NOUNALIGNED-NEXT:        v_lshlrev_b32_e32 v6, 16, v0
+; GFX7-NOUNALIGNED-NEXT:        v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NOUNALIGNED-NEXT:        v_or_b32_e32 v0, v1, v2
+; GFX7-NOUNALIGNED-NEXT:        v_or_b32_e32 v1, v3, v4
+; GFX7-NOUNALIGNED-NEXT:        v_or_b32_e32 v2, v5, v6
+; GFX7-NOUNALIGNED-NEXT:        s_setpc_b64 s[30:31]
   %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2
   ret <3 x i32> %load
 }
@@ -399,97 +398,101 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
 ;
 ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
 ; GFX9-NOUNALIGNED:       ; %bb.0:
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 1
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 2
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 3
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v7, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v6, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 4
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v9, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v8, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 5
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v11, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v10, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 6
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v13, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v12, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 7
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v15, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v14, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 8
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v17, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v16, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 9
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v19, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v18, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 10
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s0, s0, 11
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v20, v[10:11], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v21, v[12:13], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v14, v[14:15], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v15, v[16:17], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v16, v[18:19], off
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v11, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v13, s1
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v10, s2
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v12, s0
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v[10:11], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v[12:13], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v1, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v2, v[4:5], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v3, v[6:7], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v[8:9], off
-; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s1, 8
-; GFX9-NOUNALIGNED-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v5, 0xff
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v6, 8
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v0, s0, v1
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s0, v2
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, s0, v3
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, v21, v5
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, v14, v5
-; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v0, v6, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v4, v5, v0
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v0, v1, v2
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v0, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, v11, v5
-; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, v10, v5
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v15, v5, v0
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v0, v1, v2
-; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX9-NOUNALIGNED-NEXT:    ; return to shader part epilog
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 1
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v2, s2
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v3, s3
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 2
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v5, s3
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v4, s2
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 3
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v7, s3
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v6, s2
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 4
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v9, s3
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v8, s2
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 5
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v11, s3
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v10, s2
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 6
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v13, s3
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v12, s2
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 7
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v0, s0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v15, s3
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v14, s2
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 8
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v1, s1
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v16, v[0:1], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v17, v[2:3], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v18, v[4:5], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v19, v[6:7], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v8, v[8:9], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v9, v[10:11], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v10, v[12:13], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v11, v[14:15], off
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v0, s2
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v1, s3
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 9
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v2, s2
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v3, s3
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 10
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s0, s0, 11
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s1, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v5, s3
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v7, s1
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v4, s2
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v6, s0
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v12, v[0:1], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v2, v[2:3], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v3, v[4:5], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ubyte v4, v[6:7], off
+; GFX9-NOUNALIGNED-NEXT:	s_movk_i32 s0, 0xff
+; GFX9-NOUNALIGNED-NEXT:	s_mov_b32 s1, 8
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v5, 0xff
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v6, 8
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(10)
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_sdwa v0, s1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(9)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v1, s0, v18
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(8)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v7, s0, v19
+; GFX9-NOUNALIGNED-NEXT:	v_and_or_b32 v0, v16, s0, v0
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NOUNALIGNED-NEXT:	v_or3_b32 v0, v0, v1, v7
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(5)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v1, v10, v5
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(4)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v7, v11, v5
+; GFX9-NOUNALIGNED-NEXT:	v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_sdwa v0, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:	v_and_or_b32 v0, v8, v5, v0
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NOUNALIGNED-NEXT:	v_or3_b32 v1, v0, v1, v7
+; GFX9-NOUNALIGNED-NEXT:	v_readfirstlane_b32 s1, v1
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(2)
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(1)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v1, v3, v5
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(0)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v2, v4, v5
+; GFX9-NOUNALIGNED-NEXT:	v_and_or_b32 v0, v12, v5, v0
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v2, 24, v2
+; GFX9-NOUNALIGNED-NEXT:	v_or3_b32 v2, v0, v1, v2
+; GFX9-NOUNALIGNED-NEXT:	v_readfirstlane_b32 s2, v2
+; GFX9-NOUNALIGNED-NEXT:	; return to shader part epilog
 ;
 ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
 ; GFX7-UNALIGNED:       ; %bb.0:
@@ -585,52 +588,52 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
 ;
 ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
 ; GFX9-NOUNALIGNED:       ; %bb.0:
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 2
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 4
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 6
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v7, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v6, s2
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s2, s0, 8
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NOUNALIGNED-NEXT:    s_add_u32 s0, s0, 10
-; GFX9-NOUNALIGNED-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v11, s1
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v9, s3
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v10, s0
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v8, s2
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v10, v[10:11], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v1, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v2, v[4:5], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v3, v[6:7], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v4, v[8:9], off
-; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v5, 0xffff
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v0, s0, v1
-; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, v3, v5
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v1, v2, v5, v0
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, v10, v5
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v2, v4, v5, v0
-; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX9-NOUNALIGNED-NEXT:    ; return to shader part epilog
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 2
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v2, s2
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v3, s3
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 4
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v5, s3
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v4, s2
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 6
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v7, s3
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v6, s2
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s2, s0, 8
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v0, s0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v1, s1
+; GFX9-NOUNALIGNED-NEXT:	s_add_u32 s0, s0, 10
+; GFX9-NOUNALIGNED-NEXT:	s_addc_u32 s1, s1, 0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v9, s3
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v11, s1
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v10, s0
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v8, s2
+; GFX9-NOUNALIGNED-NEXT:	global_load_ushort v0, v[0:1], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ushort v1, v[2:3], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ushort v2, v[4:5], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ushort v3, v[6:7], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ushort v4, v[8:9], off
+; GFX9-NOUNALIGNED-NEXT:	global_load_ushort v5, v[10:11], off
+; GFX9-NOUNALIGNED-NEXT:	s_mov_b32 s0, 0xffff
+; GFX9-NOUNALIGNED-NEXT:	v_mov_b32_e32 v6, 0xffff
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(4)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v1, s0, v1
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NOUNALIGNED-NEXT:	v_and_or_b32 v0, v0, s0, v1
+; GFX9-NOUNALIGNED-NEXT:	v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(2)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v0, v3, v6
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NOUNALIGNED-NEXT:	v_and_or_b32 v1, v2, v6, v0
+; GFX9-NOUNALIGNED-NEXT:	s_waitcnt vmcnt(0)
+; GFX9-NOUNALIGNED-NEXT:	v_and_b32_e32 v0, v5, v6
+; GFX9-NOUNALIGNED-NEXT:	v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NOUNALIGNED-NEXT:	v_and_or_b32 v2, v4, v6, v0
+; GFX9-NOUNALIGNED-NEXT:	v_readfirstlane_b32 s1, v1
+; GFX9-NOUNALIGNED-NEXT:	v_readfirstlane_b32 s2, v2
+; GFX9-NOUNALIGNED-NEXT:	; return to shader part epilog
 ;
 ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
@@ -648,37 +651,35 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
 ;
 ; GFX7-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
 ; GFX7-NOUNALIGNED:       ; %bb.0:
-; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s2, -1
-; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:10
-; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:2
-; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 offset:4
-; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:6
-; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v5, off, s[0:3], 0 offset:8
-; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s0, 0xffff
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, s0, v2
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT:    v_and_b32_e32 v5, s0, v5
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v1, v2
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v3, v4
-; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v5, v6
-; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX7-NOUNALIGNED-NEXT:    ; return to shader part epilog
+; GFX7-NOUNALIGNED-NEXT:        s_mov_b32 s2, -1
+; GFX7-NOUNALIGNED-NEXT:        s_mov_b32 s3, 0xf000
+; GFX7-NOUNALIGNED-NEXT:        buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NOUNALIGNED-NEXT:        buffer_load_ushort v1, off, s[0:3], 0 offset:2
+; GFX7-NOUNALIGNED-NEXT:        buffer_load_ushort v2, off, s[0:3], 0 offset:4
+; GFX7-NOUNALIGNED-NEXT:        buffer_load_ushort v3, off, s[0:3], 0 offset:6
+; GFX7-NOUNALIGNED-NEXT:        buffer_load_ushort v4, off, s[0:3], 0 offset:8
+; GFX7-NOUNALIGNED-NEXT:        buffer_load_ushort v5, off, s[0:3], 0 offset:10
+; GFX7-NOUNALIGNED-NEXT:        s_mov_b32 s0, 0xffff
+; GFX7-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(5)
+; GFX7-NOUNALIGNED-NEXT:        v_and_b32_e32 v0, s0, v0
+; GFX7-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(4)
+; GFX7-NOUNALIGNED-NEXT:        v_and_b32_e32 v1, s0, v1
+; GFX7-NOUNALIGNED-NEXT:        v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(2)
+; GFX7-NOUNALIGNED-NEXT:        v_and_b32_e32 v3, s0, v3
+; GFX7-NOUNALIGNED-NEXT:        v_and_b32_e32 v2, s0, v2
+; GFX7-NOUNALIGNED-NEXT:        s_waitcnt vmcnt(0)
+; GFX7-NOUNALIGNED-NEXT:        v_and_b32_e32 v5, s0, v5
+; GFX7-NOUNALIGNED-NEXT:        v_and_b32_e32 v4, s0, v4
+; GFX7-NOUNALIGNED-NEXT:        v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NOUNALIGNED-NEXT:        v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NOUNALIGNED-NEXT:        v_or_b32_e32 v0, v0, v1
+; GFX7-NOUNALIGNED-NEXT:        v_or_b32_e32 v1, v2, v3
+; GFX7-NOUNALIGNED-NEXT:        v_or_b32_e32 v2, v4, v5
+; GFX7-NOUNALIGNED-NEXT:        v_readfirstlane_b32 s0, v0
+; GFX7-NOUNALIGNED-NEXT:        v_readfirstlane_b32 s1, v1
+; GFX7-NOUNALIGNED-NEXT:        v_readfirstlane_b32 s2, v2
+; GFX7-NOUNALIGNED-NEXT:        ; return to shader part epilog
   %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2
   ret <3 x i32> %load
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index 3d75eca93cb4..8f4f1c391535 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
 ; TRAP-HANDLER-ENABLE:  NumSgprs: 61
-; TRAP-HANDLER-DISABLE: NumSgprs: 79
+; TRAP-HANDLER-DISABLE: NumSgprs: 77
 define amdgpu_kernel void @amdhsa_trap_num_sgprs(
     i32 addrspace(1)* %out0, i32 %in0,
     i32 addrspace(1)* %out1, i32 %in1,

diff  --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 84eb94228dec..2986cb48a86e 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -840,14 +840,14 @@ entry:
 ; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
 ; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
+; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
+; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
+; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
 ; GCN: buffer_store_dword [[REG8]], {{.*$}}
 ; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
 ; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
 ; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
 ; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
-; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
@@ -874,14 +874,14 @@ entry:
 ; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
 ; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
+; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
+; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
+; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
 ; GCN: buffer_store_dword [[REG8]], {{.*$}}
 ; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
 ; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
 ; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
 ; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
-; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
-; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
-; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28

diff  --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 6bc3073eb18e..27698c58e3c0 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -177,13 +177,13 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n
 ; SI-NOSDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
 ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
-; SI-SDWA: v_or_b32_sdwa
 ; SI-SDWA: v_or_b32_e32
+; SI-SDWA: v_or_b32_sdwa
 ; SI-SDWA: v_or_b32_e32
-; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
-; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
 ; SI-SDWA: v_or_b32_sdwa
+; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; SI-SDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
 ; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
 ; SI: v_cmp_eq_u32_e32 vcc, 0
 ; SI: v_cmp_ne_u64_e32 vcc, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 21360aa85cbc..7ea072bffecb 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -636,81 +636,81 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
 ; SI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    s_mov_b32 s7, s3
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_movk_i32 s8, 0xff
-; SI-NEXT:    s_mov_b32 s6, s2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; SI-NEXT:    v_and_b32_e32 v7, 0xff00, v4
-; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
-; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
-; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v0, s8, v4
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 9, v5
-; SI-NEXT:    v_or_b32_e32 v0, v7, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
-; SI-NEXT:    v_and_b32_e32 v2, s8, v2
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x900, v0
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x9000000, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
+; SI-NEXT:        s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT:        s_load_dwordx2 s[8:9], s[0:1], 0xb
+; SI-NEXT:        s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:        s_mov_b32 s11, 0xf000
+; SI-NEXT:        s_mov_b32 s2, 0
+; SI-NEXT:        s_mov_b32 s3, s11
+; SI-NEXT:        v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:        v_mov_b32_e32 v1, 0
+; SI-NEXT:        s_waitcnt lgkmcnt(0)
+; SI-NEXT:        buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:        s_mov_b32 s10, -1
+; SI-NEXT:        s_movk_i32 s0, 0xff
+; SI-NEXT:        s_mov_b32 s6, s10
+; SI-NEXT:        s_mov_b32 s7, s11
+; SI-NEXT:        s_waitcnt vmcnt(0)
+; SI-NEXT:        v_lshrrev_b32_e32 v5, 16, v4
+; SI-NEXT:        v_lshrrev_b32_e32 v6, 24, v4
+; SI-NEXT:        v_and_b32_e32 v7, 0xff00, v4
+; SI-NEXT:        v_cvt_f32_ubyte3_e32 v3, v4
+; SI-NEXT:        v_cvt_f32_ubyte2_e32 v2, v4
+; SI-NEXT:        v_cvt_f32_ubyte1_e32 v1, v4
+; SI-NEXT:        v_cvt_f32_ubyte0_e32 v0, v4
+; SI-NEXT:        v_add_i32_e32 v4, vcc, 9, v4
+; SI-NEXT:        buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:        s_waitcnt expcnt(0)
+; SI-NEXT:        v_and_b32_e32 v0, s0, v4
+; SI-NEXT:        v_add_i32_e32 v2, vcc, 9, v5
+; SI-NEXT:        v_or_b32_e32 v0, v7, v0
+; SI-NEXT:        v_lshlrev_b32_e32 v1, 8, v6
+; SI-NEXT:        v_and_b32_e32 v2, s0, v2
+; SI-NEXT:        v_add_i32_e32 v0, vcc, 0x900, v0
+; SI-NEXT:        v_or_b32_e32 v1, v1, v2
+; SI-NEXT:        v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:        v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:        v_or_b32_e32 v0, v1, v0
+; SI-NEXT:        v_add_i32_e32 v0, vcc, 0x9000000, v0
+; SI-NEXT:        buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT:        s_endpgm
 ;
 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v5, 9
-; VI-NEXT:    s_movk_i32 s8, 0x900
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_mov_b32 s6, s2
-; VI-NEXT:    s_mov_b32 s7, s3
-; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
-; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
-; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT:    v_and_b32_e32 v7, 0xffffff00, v4
-; VI-NEXT:    v_add_u16_e32 v8, 9, v4
-; VI-NEXT:    v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v6
-; VI-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_mov_b32_e32 v2, s8
-; VI-NEXT:    v_add_u16_e32 v0, s8, v0
-; VI-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; VI-NEXT:    s_endpgm
+; VI-NEXT:        s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT:        s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; VI-NEXT:        s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:        v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:        s_mov_b32 s11, 0xf000
+; VI-NEXT:        s_mov_b32 s10, -1
+; VI-NEXT:        v_mov_b32_e32 v5, 9
+; VI-NEXT:        s_waitcnt lgkmcnt(0)
+; VI-NEXT:        v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:        v_mov_b32_e32 v1, s1
+; VI-NEXT:        v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:        flat_load_dword v4, v[0:1]
+; VI-NEXT:        s_mov_b32 s6, s10
+; VI-NEXT:        s_mov_b32 s7, s11
+; VI-NEXT:        s_movk_i32 s0, 0x900
+; VI-NEXT:        s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:        v_lshrrev_b32_e32 v6, 24, v4
+; VI-NEXT:        v_cvt_f32_ubyte3_e32 v3, v4
+; VI-NEXT:        v_cvt_f32_ubyte2_e32 v2, v4
+; VI-NEXT:        v_cvt_f32_ubyte1_e32 v1, v4
+; VI-NEXT:        v_cvt_f32_ubyte0_e32 v0, v4
+; VI-NEXT:        buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:        v_and_b32_e32 v7, 0xffffff00, v4
+; VI-NEXT:        v_add_u16_e32 v8, 9, v4
+; VI-NEXT:        v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:        v_lshlrev_b16_e32 v1, 8, v6
+; VI-NEXT:        v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:        v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:        v_mov_b32_e32 v2, s0
+; VI-NEXT:        v_add_u16_e32 v0, s0, v0
+; VI-NEXT:        v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:        v_or_b32_e32 v0, v0, v1
+; VI-NEXT:        buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT:        s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@@ -725,41 +725,42 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
 ; SI-LABEL: load_v7i8_to_v7f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    s_mov_b32 s3, s7
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
-; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
-; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
-; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
-; SI-NEXT:    buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v3
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v4
-; SI-NEXT:    v_or_b32_e32 v2, v9, v6
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v7, v8
-; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
-; SI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
-; SI-NEXT:    buffer_store_dword v7, off, s[4:7], 0 offset:24
-; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; SI-NEXT:    s_endpgm
+; SI-NEXT:        s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT:        s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT:        s_mov_b32 s7, 0xf000
+; SI-NEXT:        s_mov_b32 s2, 0
+; SI-NEXT:        s_mov_b32 s3, s7
+; SI-NEXT:        v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT:        v_mov_b32_e32 v1, 0
+; SI-NEXT:        s_waitcnt lgkmcnt(0)
+; SI-NEXT:        buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:        buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
+; SI-NEXT:        buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
+; SI-NEXT:        buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
+; SI-NEXT:        buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
+; SI-NEXT:        buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
+; SI-NEXT:        buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6
+; SI-NEXT:        s_mov_b32 s6, -1
+; SI-NEXT:        s_waitcnt vmcnt(6)
+; SI-NEXT:        v_cvt_f32_ubyte0_e32 v0, v2
+; SI-NEXT:        s_waitcnt vmcnt(5)
+; SI-NEXT:        v_cvt_f32_ubyte2_e32 v1, v3
+; SI-NEXT:        s_waitcnt vmcnt(3)
+; SI-NEXT:        v_lshlrev_b32_e32 v9, 8, v4
+; SI-NEXT:        v_or_b32_e32 v3, v9, v6
+; SI-NEXT:        s_waitcnt vmcnt(1)
+; SI-NEXT:        v_cvt_f32_ubyte2_e32 v5, v5
+; SI-NEXT:        s_waitcnt vmcnt(0)
+; SI-NEXT:        v_cvt_f32_ubyte0_e32 v2, v8
+; SI-NEXT:        buffer_store_dword v2, off, s[4:7], 0 offset:24
+; SI-NEXT:        s_waitcnt expcnt(0)
+; SI-NEXT:        v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT:        v_cvt_f32_ubyte0_e32 v4, v7
+; SI-NEXT:        v_cvt_f32_ubyte3_e32 v3, v2
+; SI-NEXT:        v_cvt_f32_ubyte2_e32 v2, v2
+; SI-NEXT:        buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; SI-NEXT:        buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:        s_endpgm
 ;
 ; VI-LABEL: load_v7i8_to_v7f32:
 ; VI:       ; %bb.0:

diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 83b3b56172c9..444421443b4f 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -1210,171 +1210,167 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
 ; SI-LABEL: v_fshr_v2i24:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
-; SI-NEXT:    s_mov_b32 s4, 0xffffff
-; SI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v0
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 4, v0
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 5, v0
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 2, v0
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_and_b32_e32 v14, s4, v1
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_and_b32_e32 v2, s4, v2
-; SI-NEXT:    v_mul_hi_u32 v12, v2, s5
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v3, s4, v3
-; SI-NEXT:    v_mul_hi_u32 v13, v3, s5
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v11, s4, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 4, v12
-; SI-NEXT:    v_mul_lo_u32 v12, v12, 24
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 4, v13
-; SI-NEXT:    v_mul_lo_u32 v13, v13, 24
-; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v12
-; SI-NEXT:    v_lshr_b32_e32 v12, v14, v2
-; SI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v13
-; SI-NEXT:    v_sub_i32_e32 v13, vcc, 24, v2
-; SI-NEXT:    v_sub_i32_e32 v14, vcc, 24, v3
-; SI-NEXT:    v_and_b32_e32 v13, s4, v13
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshl_b32_e32 v5, v5, v13
-; SI-NEXT:    v_and_b32_e32 v14, 0xffffff, v14
-; SI-NEXT:    v_lshr_b32_e32 v11, v11, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshl_b32_e32 v6, v6, v14
-; SI-NEXT:    v_or_b32_e32 v5, v5, v12
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; SI-NEXT:    v_or_b32_e32 v6, v6, v11
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
-; SI-NEXT:    buffer_store_byte v2, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_byte v0, v8, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_byte v2, v9, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_byte v1, v10, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:	s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:	buffer_load_dword v1, off, s[0:3], s32 offset:8
+; SI-NEXT:	buffer_load_dword v2, off, s[0:3], s32 offset:16
+; SI-NEXT:	buffer_load_dword v3, off, s[0:3], s32 offset:20
+; SI-NEXT:	buffer_load_dword v4, off, s[0:3], s32 offset:4
+; SI-NEXT:	buffer_load_dword v5, off, s[0:3], s32 offset:12
+; SI-NEXT:	buffer_load_dword v6, off, s[0:3], s32
+; SI-NEXT:	s_mov_b32 s4, 0xffffff
+; SI-NEXT:	s_mov_b32 s5, 0xaaaaaaab
+; SI-NEXT:	v_add_i32_e32 v7, vcc, 3, v0
+; SI-NEXT:	v_add_i32_e32 v8, vcc, 4, v0
+; SI-NEXT:	v_add_i32_e32 v9, vcc, 5, v0
+; SI-NEXT:	v_add_i32_e32 v10, vcc, 2, v0
+; SI-NEXT:	s_waitcnt vmcnt(5)
+; SI-NEXT:	v_and_b32_e32 v14, s4, v1
+; SI-NEXT:	s_waitcnt vmcnt(4)
+; SI-NEXT:	v_and_b32_e32 v2, s4, v2
+; SI-NEXT:	v_mul_hi_u32 v12, v2, s5
+; SI-NEXT:	s_waitcnt vmcnt(3)
+; SI-NEXT:	v_and_b32_e32 v3, s4, v3
+; SI-NEXT:	v_mul_hi_u32 v13, v3, s5
+; SI-NEXT:	s_waitcnt vmcnt(1)
+; SI-NEXT:	v_and_b32_e32 v11, s4, v5
+; SI-NEXT:	v_lshrrev_b32_e32 v12, 4, v12
+; SI-NEXT:	v_mul_lo_u32 v12, v12, 24
+; SI-NEXT:	v_lshrrev_b32_e32 v13, 4, v13
+; SI-NEXT:	v_mul_lo_u32 v13, v13, 24
+; SI-NEXT:	v_sub_i32_e32 v2, vcc, v2, v12
+; SI-NEXT:	v_lshr_b32_e32 v12, v14, v2
+; SI-NEXT:	v_sub_i32_e32 v3, vcc, v3, v13
+; SI-NEXT:	v_sub_i32_e32 v13, vcc, 24, v2
+; SI-NEXT:	v_sub_i32_e32 v14, vcc, 24, v3
+; SI-NEXT:	v_and_b32_e32 v13, s4, v13
+; SI-NEXT:	s_waitcnt vmcnt(0)
+; SI-NEXT:	v_lshl_b32_e32 v6, v6, v13
+; SI-NEXT:	v_and_b32_e32 v14, 0xffffff, v14
+; SI-NEXT:	v_lshr_b32_e32 v11, v11, v3
+; SI-NEXT:	v_lshl_b32_e32 v4, v4, v14
+; SI-NEXT:	v_or_b32_e32 v6, v6, v12
+; SI-NEXT:	v_cmp_eq_u32_e32 vcc, 0, v2
+; SI-NEXT:	v_cndmask_b32_e32 v1, v6, v1, vcc
+; SI-NEXT:	v_or_b32_e32 v4, v4, v11
+; SI-NEXT:	v_cmp_eq_u32_e32 vcc, 0, v3
+; SI-NEXT:	v_cndmask_b32_e32 v2, v4, v5, vcc
+; SI-NEXT:	buffer_store_byte v2, v7, s[0:3], 0 offen
+; SI-NEXT:	buffer_store_short v1, v0, s[0:3], 0 offen
+; SI-NEXT:	v_lshrrev_b32_e32 v0, 8, v2
+; SI-NEXT:	s_waitcnt expcnt(1)
+; SI-NEXT:	v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:	s_waitcnt expcnt(0)
+; SI-NEXT:	v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:	buffer_store_byte v0, v8, s[0:3], 0 offen
+; SI-NEXT:	buffer_store_byte v2, v9, s[0:3], 0 offen
+; SI-NEXT:	buffer_store_byte v1, v10, s[0:3], 0 offen
+; SI-NEXT:	s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:	s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_v2i24:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
-; VI-NEXT:    s_mov_b32 s4, 0xffffff
-; VI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
-; VI-NEXT:    v_add_u32_e32 v7, vcc, 3, v0
-; VI-NEXT:    v_add_u32_e32 v8, vcc, 4, v0
-; VI-NEXT:    v_add_u32_e32 v9, vcc, 5, v0
-; VI-NEXT:    v_add_u32_e32 v10, vcc, 2, v0
-; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_and_b32_e32 v14, s4, v1
-; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_and_b32_e32 v2, s4, v2
-; VI-NEXT:    v_mul_hi_u32 v12, v2, s5
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_and_b32_e32 v3, s4, v3
-; VI-NEXT:    v_mul_hi_u32 v13, v3, s5
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_and_b32_e32 v11, s4, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v12, 4, v12
-; VI-NEXT:    v_mul_lo_u32 v12, v12, 24
-; VI-NEXT:    v_lshrrev_b32_e32 v13, 4, v13
-; VI-NEXT:    v_mul_lo_u32 v13, v13, 24
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v12
-; VI-NEXT:    v_lshrrev_b32_e32 v12, v2, v14
-; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v13
-; VI-NEXT:    v_sub_u32_e32 v13, vcc, 24, v2
-; VI-NEXT:    v_sub_u32_e32 v14, vcc, 24, v3
-; VI-NEXT:    v_and_b32_e32 v13, s4, v13
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v5, v13, v5
-; VI-NEXT:    v_and_b32_e32 v14, 0xffffff, v14
-; VI-NEXT:    v_lshrrev_b32_e32 v11, v3, v11
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v6, v14, v6
-; VI-NEXT:    v_or_b32_e32 v5, v5, v12
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-NEXT:    v_or_b32_e32 v6, v6, v11
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; VI-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
-; VI-NEXT:    buffer_store_byte v2, v7, s[0:3], 0 offen
-; VI-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT:    buffer_store_byte v0, v8, s[0:3], 0 offen
-; VI-NEXT:    buffer_store_byte v2, v9, s[0:3], 0 offen
-; VI-NEXT:    buffer_store_byte v1, v10, s[0:3], 0 offen
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_setpc_b64 s[30:31]
+; VI-NEXT:	s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:	buffer_load_dword v1, off, s[0:3], s32 offset:8
+; VI-NEXT:	buffer_load_dword v2, off, s[0:3], s32 offset:16
+; VI-NEXT:	buffer_load_dword v3, off, s[0:3], s32 offset:20
+; VI-NEXT:	buffer_load_dword v4, off, s[0:3], s32 offset:4
+; VI-NEXT:	buffer_load_dword v5, off, s[0:3], s32 offset:12
+; VI-NEXT:	buffer_load_dword v6, off, s[0:3], s32
+; VI-NEXT:	s_mov_b32 s4, 0xffffff
+; VI-NEXT:	s_mov_b32 s5, 0xaaaaaaab
+; VI-NEXT:	v_add_u32_e32 v7, vcc, 3, v0
+; VI-NEXT:	v_add_u32_e32 v8, vcc, 4, v0
+; VI-NEXT:	v_add_u32_e32 v9, vcc, 5, v0
+; VI-NEXT:	v_add_u32_e32 v10, vcc, 2, v0
+; VI-NEXT:	s_waitcnt vmcnt(5)
+; VI-NEXT:	v_and_b32_e32 v14, s4, v1
+; VI-NEXT:	s_waitcnt vmcnt(4)
+; VI-NEXT:	v_and_b32_e32 v2, s4, v2
+; VI-NEXT:	v_mul_hi_u32 v12, v2, s5
+; VI-NEXT:	s_waitcnt vmcnt(3)
+; VI-NEXT:	v_and_b32_e32 v3, s4, v3
+; VI-NEXT:	v_mul_hi_u32 v13, v3, s5
+; VI-NEXT:	s_waitcnt vmcnt(1)
+; VI-NEXT:	v_and_b32_e32 v11, s4, v5
+; VI-NEXT:	v_lshrrev_b32_e32 v12, 4, v12
+; VI-NEXT:	v_mul_lo_u32 v12, v12, 24
+; VI-NEXT:	v_lshrrev_b32_e32 v13, 4, v13
+; VI-NEXT:	v_mul_lo_u32 v13, v13, 24
+; VI-NEXT:	v_sub_u32_e32 v2, vcc, v2, v12
+; VI-NEXT:	v_lshrrev_b32_e32 v12, v2, v14
+; VI-NEXT:	v_sub_u32_e32 v3, vcc, v3, v13
+; VI-NEXT:	v_sub_u32_e32 v13, vcc, 24, v2
+; VI-NEXT:	v_sub_u32_e32 v14, vcc, 24, v3
+; VI-NEXT:	v_and_b32_e32 v13, s4, v13
+; VI-NEXT:	s_waitcnt vmcnt(0)
+; VI-NEXT:	v_lshlrev_b32_e32 v6, v13, v6
+; VI-NEXT:	v_and_b32_e32 v14, 0xffffff, v14
+; VI-NEXT:	v_lshrrev_b32_e32 v11, v3, v11
+; VI-NEXT:	v_lshlrev_b32_e32 v4, v14, v4
+; VI-NEXT:	v_or_b32_e32 v6, v6, v12
+; VI-NEXT:	v_cmp_eq_u32_e32 vcc, 0, v2
+; VI-NEXT:	v_cndmask_b32_e32 v1, v6, v1, vcc
+; VI-NEXT:	v_or_b32_e32 v4, v4, v11
+; VI-NEXT:	v_cmp_eq_u32_e32 vcc, 0, v3
+; VI-NEXT:	v_cndmask_b32_e32 v2, v4, v5, vcc
+; VI-NEXT:	buffer_store_byte v2, v7, s[0:3], 0 offen
+; VI-NEXT:	buffer_store_short v1, v0, s[0:3], 0 offen
+; VI-NEXT:	v_lshrrev_b32_e32 v0, 8, v2
+; VI-NEXT:	v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT:	v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT:	buffer_store_byte v0, v8, s[0:3], 0 offen
+; VI-NEXT:	buffer_store_byte v2, v9, s[0:3], 0 offen
+; VI-NEXT:	buffer_store_byte v1, v10, s[0:3], 0 offen
+; VI-NEXT:	s_waitcnt vmcnt(0)
+; VI-NEXT:	s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32
-; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffffff
-; GFX9-NEXT:    s_mov_b32 s5, 0xaaaaaaab
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_and_b32_e32 v10, s4, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v2, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX9-NEXT:    v_mul_hi_u32 v7, v3, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_and_b32_e32 v9, s4, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
-; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, v2, v10
-; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v7
-; GFX9-NEXT:    v_sub_u32_e32 v7, 24, v2
-; GFX9-NEXT:    v_sub_u32_e32 v10, 24, v3
-; GFX9-NEXT:    v_and_b32_e32 v7, s4, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, v3, v9
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffffff, v10
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshl_or_b32 v5, v5, v7, v6
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshl_or_b32 v6, v8, v10, v9
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
-; GFX9-NEXT:    buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
-; GFX9-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
-; GFX9-NEXT:    buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
-; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:        buffer_load_dword v1, off, s[0:3], s32 offset:16
+; GFX9-NEXT:        buffer_load_dword v2, off, s[0:3], s32 offset:20
+; GFX9-NEXT:        buffer_load_dword v3, off, s[0:3], s32 offset:4
+; GFX9-NEXT:        buffer_load_dword v4, off, s[0:3], s32 offset:12
+; GFX9-NEXT:        buffer_load_dword v5, off, s[0:3], s32
+; GFX9-NEXT:        buffer_load_dword v8, off, s[0:3], s32 offset:8
+; GFX9-NEXT:        s_mov_b32 s4, 0xffffff
+; GFX9-NEXT:        s_mov_b32 s5, 0xaaaaaaab
+; GFX9-NEXT:        s_waitcnt vmcnt(5)
+; GFX9-NEXT:        v_and_b32_e32 v1, s4, v1
+; GFX9-NEXT:        v_mul_hi_u32 v6, v1, s5
+; GFX9-NEXT:        s_waitcnt vmcnt(4)
+; GFX9-NEXT:        v_and_b32_e32 v2, s4, v2
+; GFX9-NEXT:        v_mul_hi_u32 v7, v2, s5
+; GFX9-NEXT:        s_waitcnt vmcnt(2)
+; GFX9-NEXT:        v_and_b32_e32 v9, s4, v4
+; GFX9-NEXT:        v_lshrrev_b32_e32 v6, 4, v6
+; GFX9-NEXT:        v_mul_lo_u32 v6, v6, 24
+; GFX9-NEXT:        v_lshrrev_b32_e32 v7, 4, v7
+; GFX9-NEXT:        v_mul_lo_u32 v7, v7, 24
+; GFX9-NEXT:        s_waitcnt vmcnt(0)
+; GFX9-NEXT:        v_and_b32_e32 v10, s4, v8
+; GFX9-NEXT:        v_sub_u32_e32 v1, v1, v6
+; GFX9-NEXT:        v_lshrrev_b32_e32 v6, v1, v10
+; GFX9-NEXT:        v_sub_u32_e32 v2, v2, v7
+; GFX9-NEXT:        v_sub_u32_e32 v7, 24, v1
+; GFX9-NEXT:        v_sub_u32_e32 v10, 24, v2
+; GFX9-NEXT:        v_and_b32_e32 v7, s4, v7
+; GFX9-NEXT:        v_lshrrev_b32_e32 v9, v2, v9
+; GFX9-NEXT:        v_and_b32_e32 v10, 0xffffff, v10
+; GFX9-NEXT:        v_lshl_or_b32 v5, v5, v7, v6
+; GFX9-NEXT:        v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:        v_cndmask_b32_e32 v1, v5, v8, vcc
+; GFX9-NEXT:        v_lshl_or_b32 v3, v3, v10, v9
+; GFX9-NEXT:        v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:        v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:        v_lshrrev_b32_e32 v3, 8, v2
+; GFX9-NEXT:        buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
+; GFX9-NEXT:        buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:        buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
+; GFX9-NEXT:        buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
+; GFX9-NEXT:        buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX9-NEXT:        s_waitcnt vmcnt(0)
+; GFX9-NEXT:        s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_v2i24:
 ; R600:       ; %bb.0:

diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index bca00f69e25c..bfc42e48bca8 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -273,8 +273,8 @@ entry:
 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
 ; MOVREL: v_movreld_b32_e32 v0, 5
 
-; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
-; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
+; IDXMODE: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0xfffffe00{{$}}
+; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(DST)
 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
 ; IDXMODE-NEXT: s_set_gpr_idx_off
 define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, <16 x i32> %vec, i32 %offset) {

diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 72f8c9cfc55f..9aee862dbe11 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -855,10 +855,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
 ; HSA-GFX9: kernarg_segment_byte_size = 28
-; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
-; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
 ; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
 ; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
+; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
+; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
   %val0 = extractvalue <{i32, i64}> %arg0, 0
   %val1 = extractvalue <{i32, i64}> %arg0, 1

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 76a218760e8e..667425d86c69 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -237,157 +237,157 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
 define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
 ; SI-LABEL: round_v4f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0x11
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_movk_i32 s18, 0xfc01
-; SI-NEXT:    s_mov_b32 s3, 0xfffff
-; SI-NEXT:    s_mov_b32 s2, s6
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
-; SI-NEXT:    s_add_i32 s19, s0, s18
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s19
-; SI-NEXT:    s_brev_b32 s20, 1
-; SI-NEXT:    s_andn2_b64 s[16:17], s[10:11], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s11, s20
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    v_mov_b32_e32 v0, s17
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s19, 0
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s11
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s19, 51
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, s10
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
-; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
-; SI-NEXT:    s_add_i32 s17, s0, s18
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
-; SI-NEXT:    s_brev_b32 s16, -2
-; SI-NEXT:    v_mov_b32_e32 v12, 0x3ff00000
-; SI-NEXT:    v_mov_b32_e32 v4, s11
-; SI-NEXT:    v_bfi_b32 v4, s16, v12, v4
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s17
-; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_andn2_b64 s[10:11], s[8:9], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s9, s20
-; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s17, 0
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s17, 51
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v0, s10
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, s8
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; SI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[0:1]
-; SI-NEXT:    s_bfe_u32 s0, s15, 0xb0014
-; SI-NEXT:    s_add_i32 s10, s0, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s9
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s10
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; SI-NEXT:    s_andn2_b64 s[8:9], s[14:15], s[0:1]
-; SI-NEXT:    v_bfi_b32 v6, s16, v12, v6
-; SI-NEXT:    s_and_b32 s0, s15, s20
-; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v6, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s0
-; SI-NEXT:    v_mov_b32_e32 v4, s9
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s10, 0
-; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s10, 51
-; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v4, s8
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v6, s14
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
-; SI-NEXT:    v_add_f64 v[6:7], s[14:15], -v[4:5]
-; SI-NEXT:    s_bfe_u32 s0, s13, 0xb0014
-; SI-NEXT:    v_mov_b32_e32 v10, s15
-; SI-NEXT:    s_add_i32 s8, s0, s18
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s8
-; SI-NEXT:    v_bfi_b32 v10, s16, v12, v10
-; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v10, vcc
-; SI-NEXT:    v_mov_b32_e32 v6, 0
-; SI-NEXT:    s_andn2_b64 s[2:3], s[12:13], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s13, s20
-; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
-; SI-NEXT:    v_mov_b32_e32 v5, s0
-; SI-NEXT:    v_mov_b32_e32 v4, s3
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s8, 0
-; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s13
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s8, 51
-; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v4, s2
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v10, s12
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
-; SI-NEXT:    v_add_f64 v[10:11], s[12:13], -v[4:5]
-; SI-NEXT:    v_mov_b32_e32 v13, s13
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5
-; SI-NEXT:    v_bfi_b32 v12, s16, v12, v13
-; SI-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc
-; SI-NEXT:    v_mov_b32_e32 v10, 0
-; SI-NEXT:    v_mov_b32_e32 v8, 0
-; SI-NEXT:    v_add_f64 v[4:5], v[4:5], v[10:11]
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[8:9]
-; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; SI-NEXT:    s_endpgm
+; SI-NEXT:	s_load_dwordx8 s[8:15], s[0:1], 0x11
+; SI-NEXT:	s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT:	s_mov_b32 s6, -1
+; SI-NEXT:	s_movk_i32 s18, 0xfc01
+; SI-NEXT:	s_mov_b32 s3, 0xfffff
+; SI-NEXT:	s_waitcnt lgkmcnt(0)
+; SI-NEXT:	s_bfe_u32 s0, s11, 0xb0014
+; SI-NEXT:	s_add_i32 s19, s0, s18
+; SI-NEXT:	s_mov_b32 s2, s6
+; SI-NEXT:	s_lshr_b64 s[0:1], s[2:3], s19
+; SI-NEXT:	s_brev_b32 s20, 1
+; SI-NEXT:	s_andn2_b64 s[16:17], s[10:11], s[0:1]
+; SI-NEXT:	s_and_b32 s0, s11, s20
+; SI-NEXT:	v_mov_b32_e32 v1, s0
+; SI-NEXT:	v_mov_b32_e32 v0, s17
+; SI-NEXT:	v_cmp_lt_i32_e64 vcc, s19, 0
+; SI-NEXT:	v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT:	v_mov_b32_e32 v1, s11
+; SI-NEXT:	v_cmp_gt_i32_e64 s[0:1], s19, 51
+; SI-NEXT:	v_cndmask_b32_e64 v1, v0, v1, s[0:1]
+; SI-NEXT:	v_mov_b32_e32 v0, s16
+; SI-NEXT:	v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT:	v_mov_b32_e32 v2, s10
+; SI-NEXT:	v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; SI-NEXT:	v_add_f64 v[2:3], s[10:11], -v[0:1]
+; SI-NEXT:	s_bfe_u32 s0, s9, 0xb0014
+; SI-NEXT:	s_add_i32 s17, s0, s18
+; SI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
+; SI-NEXT:	s_brev_b32 s16, -2
+; SI-NEXT:	v_mov_b32_e32 v12, 0x3ff00000
+; SI-NEXT:	v_mov_b32_e32 v4, s11
+; SI-NEXT:	v_bfi_b32 v4, s16, v12, v4
+; SI-NEXT:	s_lshr_b64 s[0:1], s[2:3], s17
+; SI-NEXT:	v_cndmask_b32_e32 v3, 0, v4, vcc
+; SI-NEXT:	v_mov_b32_e32 v2, 0
+; SI-NEXT:	s_andn2_b64 s[10:11], s[8:9], s[0:1]
+; SI-NEXT:	s_and_b32 s0, s9, s20
+; SI-NEXT:	v_add_f64 v[2:3], v[0:1], v[2:3]
+; SI-NEXT:	v_mov_b32_e32 v1, s0
+; SI-NEXT:	v_mov_b32_e32 v0, s11
+; SI-NEXT:	v_cmp_lt_i32_e64 vcc, s17, 0
+; SI-NEXT:	v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT:	v_mov_b32_e32 v1, s9
+; SI-NEXT:	v_cmp_gt_i32_e64 s[0:1], s17, 51
+; SI-NEXT:	v_cndmask_b32_e64 v1, v0, v1, s[0:1]
+; SI-NEXT:	v_mov_b32_e32 v0, s10
+; SI-NEXT:	v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT:	v_mov_b32_e32 v4, s8
+; SI-NEXT:	v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; SI-NEXT:	v_add_f64 v[4:5], s[8:9], -v[0:1]
+; SI-NEXT:	s_bfe_u32 s0, s15, 0xb0014
+; SI-NEXT:	s_add_i32 s10, s0, s18
+; SI-NEXT:	v_mov_b32_e32 v6, s9
+; SI-NEXT:	s_lshr_b64 s[0:1], s[2:3], s10
+; SI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; SI-NEXT:	s_andn2_b64 s[8:9], s[14:15], s[0:1]
+; SI-NEXT:	v_bfi_b32 v6, s16, v12, v6
+; SI-NEXT:	s_and_b32 s0, s15, s20
+; SI-NEXT:	v_cndmask_b32_e32 v9, 0, v6, vcc
+; SI-NEXT:	v_mov_b32_e32 v5, s0
+; SI-NEXT:	v_mov_b32_e32 v4, s9
+; SI-NEXT:	v_cmp_lt_i32_e64 vcc, s10, 0
+; SI-NEXT:	v_cndmask_b32_e32 v4, v4, v5, vcc
+; SI-NEXT:	v_mov_b32_e32 v5, s15
+; SI-NEXT:	v_cmp_gt_i32_e64 s[0:1], s10, 51
+; SI-NEXT:	v_cndmask_b32_e64 v5, v4, v5, s[0:1]
+; SI-NEXT:	v_mov_b32_e32 v4, s8
+; SI-NEXT:	v_cndmask_b32_e64 v4, v4, 0, vcc
+; SI-NEXT:	v_mov_b32_e32 v6, s14
+; SI-NEXT:	v_cndmask_b32_e64 v4, v4, v6, s[0:1]
+; SI-NEXT:	v_add_f64 v[6:7], s[14:15], -v[4:5]
+; SI-NEXT:	s_bfe_u32 s0, s13, 0xb0014
+; SI-NEXT:	v_mov_b32_e32 v10, s15
+; SI-NEXT:	s_add_i32 s8, s0, s18
+; SI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
+; SI-NEXT:	s_lshr_b64 s[0:1], s[2:3], s8
+; SI-NEXT:	v_bfi_b32 v10, s16, v12, v10
+; SI-NEXT:	v_cndmask_b32_e32 v7, 0, v10, vcc
+; SI-NEXT:	v_mov_b32_e32 v6, 0
+; SI-NEXT:	s_andn2_b64 s[2:3], s[12:13], s[0:1]
+; SI-NEXT:	s_and_b32 s0, s13, s20
+; SI-NEXT:	v_add_f64 v[6:7], v[4:5], v[6:7]
+; SI-NEXT:	v_mov_b32_e32 v5, s0
+; SI-NEXT:	v_mov_b32_e32 v4, s3
+; SI-NEXT:	v_cmp_lt_i32_e64 vcc, s8, 0
+; SI-NEXT:	v_cndmask_b32_e32 v4, v4, v5, vcc
+; SI-NEXT:	v_mov_b32_e32 v5, s13
+; SI-NEXT:	v_cmp_gt_i32_e64 s[0:1], s8, 51
+; SI-NEXT:	v_cndmask_b32_e64 v5, v4, v5, s[0:1]
+; SI-NEXT:	v_mov_b32_e32 v4, s2
+; SI-NEXT:	v_cndmask_b32_e64 v4, v4, 0, vcc
+; SI-NEXT:	v_mov_b32_e32 v10, s12
+; SI-NEXT:	v_cndmask_b32_e64 v4, v4, v10, s[0:1]
+; SI-NEXT:	v_add_f64 v[10:11], s[12:13], -v[4:5]
+; SI-NEXT:	v_mov_b32_e32 v13, s13
+; SI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5
+; SI-NEXT:	v_bfi_b32 v12, s16, v12, v13
+; SI-NEXT:	v_cndmask_b32_e32 v11, 0, v12, vcc
+; SI-NEXT:	v_mov_b32_e32 v10, 0
+; SI-NEXT:	v_mov_b32_e32 v8, 0
+; SI-NEXT:	v_add_f64 v[4:5], v[4:5], v[10:11]
+; SI-NEXT:	s_mov_b32 s7, 0xf000
+; SI-NEXT:	v_add_f64 v[0:1], v[0:1], v[8:9]
+; SI-NEXT:	buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; SI-NEXT:	buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:	s_endpgm
 ;
 ; CI-LABEL: round_v4f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; CI-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0x11
-; CI-NEXT:    s_brev_b32 s2, -2
-; CI-NEXT:    v_mov_b32_e32 v12, 0x3ff00000
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, -1
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[10:11]
-; CI-NEXT:    v_mov_b32_e32 v4, s11
-; CI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
-; CI-NEXT:    v_bfi_b32 v4, s2, v12, v4
-; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
-; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[8:9]
-; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
-; CI-NEXT:    v_mov_b32_e32 v2, 0
-; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; CI-NEXT:    v_add_f64 v[0:1], s[8:9], -v[8:9]
-; CI-NEXT:    v_mov_b32_e32 v4, s9
-; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
-; CI-NEXT:    v_bfi_b32 v4, s2, v12, v4
-; CI-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
-; CI-NEXT:    v_trunc_f64_e32 v[4:5], s[14:15]
-; CI-NEXT:    v_mov_b32_e32 v10, s15
-; CI-NEXT:    v_add_f64 v[6:7], s[14:15], -v[4:5]
-; CI-NEXT:    v_bfi_b32 v10, s2, v12, v10
-; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
-; CI-NEXT:    v_mov_b32_e32 v6, 0
-; CI-NEXT:    v_cndmask_b32_e32 v7, 0, v10, vcc
-; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[12:13]
-; CI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
-; CI-NEXT:    v_add_f64 v[4:5], s[12:13], -v[10:11]
-; CI-NEXT:    v_mov_b32_e32 v13, s13
-; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; CI-NEXT:    v_bfi_b32 v12, s2, v12, v13
-; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v12, vcc
-; CI-NEXT:    v_mov_b32_e32 v4, 0
-; CI-NEXT:    v_mov_b32_e32 v0, 0
-; CI-NEXT:    v_add_f64 v[4:5], v[10:11], v[4:5]
-; CI-NEXT:    v_add_f64 v[0:1], v[8:9], v[0:1]
-; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CI-NEXT:    s_endpgm
+; CI-NEXT:	s_load_dwordx8 s[4:11], s[0:1], 0x11
+; CI-NEXT:	s_brev_b32 s12, -2
+; CI-NEXT:	v_mov_b32_e32 v12, 0x3ff00000
+; CI-NEXT:	s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:	s_mov_b32 s3, 0xf000
+; CI-NEXT:	s_waitcnt lgkmcnt(0)
+; CI-NEXT:	v_trunc_f64_e32 v[0:1], s[6:7]
+; CI-NEXT:	v_mov_b32_e32 v4, s7
+; CI-NEXT:	v_add_f64 v[2:3], s[6:7], -v[0:1]
+; CI-NEXT:	v_bfi_b32 v4, s12, v12, v4
+; CI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
+; CI-NEXT:	v_trunc_f64_e32 v[8:9], s[4:5]
+; CI-NEXT:	v_cndmask_b32_e32 v3, 0, v4, vcc
+; CI-NEXT:	v_mov_b32_e32 v2, 0
+; CI-NEXT:	v_add_f64 v[2:3], v[0:1], v[2:3]
+; CI-NEXT:	v_add_f64 v[0:1], s[4:5], -v[8:9]
+; CI-NEXT:	v_mov_b32_e32 v4, s5
+; CI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
+; CI-NEXT:	v_bfi_b32 v4, s12, v12, v4
+; CI-NEXT:	v_cndmask_b32_e32 v1, 0, v4, vcc
+; CI-NEXT:	v_trunc_f64_e32 v[4:5], s[10:11]
+; CI-NEXT:	v_mov_b32_e32 v10, s11
+; CI-NEXT:	v_add_f64 v[6:7], s[10:11], -v[4:5]
+; CI-NEXT:	v_bfi_b32 v10, s12, v12, v10
+; CI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
+; CI-NEXT:	v_mov_b32_e32 v6, 0
+; CI-NEXT:	v_cndmask_b32_e32 v7, 0, v10, vcc
+; CI-NEXT:	v_trunc_f64_e32 v[10:11], s[8:9]
+; CI-NEXT:	v_add_f64 v[6:7], v[4:5], v[6:7]
+; CI-NEXT:	v_add_f64 v[4:5], s[8:9], -v[10:11]
+; CI-NEXT:	v_mov_b32_e32 v13, s9
+; CI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; CI-NEXT:	v_bfi_b32 v12, s12, v12, v13
+; CI-NEXT:	v_cndmask_b32_e32 v5, 0, v12, vcc
+; CI-NEXT:	v_mov_b32_e32 v4, 0
+; CI-NEXT:	v_mov_b32_e32 v0, 0
+; CI-NEXT:	v_add_f64 v[4:5], v[10:11], v[4:5]
+; CI-NEXT:	s_mov_b32 s2, -1
+; CI-NEXT:	v_add_f64 v[0:1], v[8:9], v[0:1]
+; CI-NEXT:	buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; CI-NEXT:	buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:	s_endpgm
   %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
   store <4 x double> %result, <4 x double> addrspace(1)* %out
   ret void
@@ -600,82 +600,82 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ;
 ; CI-LABEL: round_v8f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; CI-NEXT:    s_load_dwordx16 s[8:23], s[0:1], 0x19
-; CI-NEXT:    s_brev_b32 s2, -2
-; CI-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, -1
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[10:11]
-; CI-NEXT:    v_mov_b32_e32 v4, s11
-; CI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
-; CI-NEXT:    v_bfi_b32 v4, s2, v16, v4
-; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
-; CI-NEXT:    v_mov_b32_e32 v2, 0
-; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
-; CI-NEXT:    v_trunc_f64_e32 v[4:5], s[8:9]
-; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; CI-NEXT:    v_add_f64 v[0:1], s[8:9], -v[4:5]
-; CI-NEXT:    v_mov_b32_e32 v6, s9
-; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
-; CI-NEXT:    v_bfi_b32 v6, s2, v16, v6
-; CI-NEXT:    v_cndmask_b32_e32 v1, 0, v6, vcc
-; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[14:15]
-; CI-NEXT:    v_mov_b32_e32 v0, 0
-; CI-NEXT:    v_add_f64 v[0:1], v[4:5], v[0:1]
-; CI-NEXT:    v_add_f64 v[4:5], s[14:15], -v[6:7]
-; CI-NEXT:    v_mov_b32_e32 v8, s15
-; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; CI-NEXT:    v_bfi_b32 v8, s2, v16, v8
-; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v8, vcc
-; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[12:13]
-; CI-NEXT:    v_mov_b32_e32 v4, 0
-; CI-NEXT:    v_add_f64 v[6:7], v[6:7], v[4:5]
-; CI-NEXT:    v_add_f64 v[4:5], s[12:13], -v[8:9]
-; CI-NEXT:    v_mov_b32_e32 v10, s13
-; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; CI-NEXT:    v_bfi_b32 v10, s2, v16, v10
-; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v10, vcc
-; CI-NEXT:    v_mov_b32_e32 v4, 0
-; CI-NEXT:    v_add_f64 v[4:5], v[8:9], v[4:5]
-; CI-NEXT:    v_mov_b32_e32 v8, s19
-; CI-NEXT:    v_bfi_b32 v18, s2, v16, v8
-; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[20:21]
-; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[22:23]
-; CI-NEXT:    v_add_f64 v[14:15], s[20:21], -v[8:9]
-; CI-NEXT:    v_mov_b32_e32 v19, s23
-; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
-; CI-NEXT:    v_add_f64 v[14:15], s[22:23], -v[10:11]
-; CI-NEXT:    v_mov_b32_e32 v17, s21
-; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
-; CI-NEXT:    v_bfi_b32 v19, s2, v16, v19
-; CI-NEXT:    v_trunc_f64_e32 v[12:13], s[16:17]
-; CI-NEXT:    v_bfi_b32 v17, s2, v16, v17
-; CI-NEXT:    v_cndmask_b32_e64 v15, 0, v19, s[0:1]
-; CI-NEXT:    v_mov_b32_e32 v14, 0
-; CI-NEXT:    v_add_f64 v[10:11], v[10:11], v[14:15]
-; CI-NEXT:    v_cndmask_b32_e32 v15, 0, v17, vcc
-; CI-NEXT:    v_mov_b32_e32 v14, 0
-; CI-NEXT:    v_mov_b32_e32 v17, s17
-; CI-NEXT:    v_add_f64 v[8:9], v[8:9], v[14:15]
-; CI-NEXT:    v_add_f64 v[14:15], s[16:17], -v[12:13]
-; CI-NEXT:    v_bfi_b32 v19, s2, v16, v17
-; CI-NEXT:    v_trunc_f64_e32 v[16:17], s[18:19]
-; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
-; CI-NEXT:    v_add_f64 v[14:15], s[18:19], -v[16:17]
-; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
-; CI-NEXT:    v_mov_b32_e32 v14, 0
-; CI-NEXT:    v_cndmask_b32_e64 v15, 0, v18, s[0:1]
-; CI-NEXT:    v_add_f64 v[14:15], v[16:17], v[14:15]
-; CI-NEXT:    v_cndmask_b32_e32 v17, 0, v19, vcc
-; CI-NEXT:    v_mov_b32_e32 v16, 0
-; CI-NEXT:    v_add_f64 v[12:13], v[12:13], v[16:17]
-; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
-; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32
-; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CI-NEXT:    s_endpgm
+; CI-NEXT:	s_load_dwordx16 s[8:23], s[0:1], 0x19
+; CI-NEXT:	s_brev_b32 s2, -2
+; CI-NEXT:	v_mov_b32_e32 v16, 0x3ff00000
+; CI-NEXT:	s_load_dwordx2 s[4:5], s[0:1], 0x9
+; CI-NEXT:	s_mov_b32 s7, 0xf000
+; CI-NEXT:	s_waitcnt lgkmcnt(0)
+; CI-NEXT:	v_trunc_f64_e32 v[0:1], s[10:11]
+; CI-NEXT:	v_mov_b32_e32 v4, s11
+; CI-NEXT:	v_add_f64 v[2:3], s[10:11], -v[0:1]
+; CI-NEXT:	v_bfi_b32 v4, s2, v16, v4
+; CI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
+; CI-NEXT:	v_mov_b32_e32 v2, 0
+; CI-NEXT:	v_cndmask_b32_e32 v3, 0, v4, vcc
+; CI-NEXT:	v_trunc_f64_e32 v[4:5], s[8:9]
+; CI-NEXT:	v_add_f64 v[2:3], v[0:1], v[2:3]
+; CI-NEXT:	v_add_f64 v[0:1], s[8:9], -v[4:5]
+; CI-NEXT:	v_mov_b32_e32 v6, s9
+; CI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
+; CI-NEXT:	v_bfi_b32 v6, s2, v16, v6
+; CI-NEXT:	v_cndmask_b32_e32 v1, 0, v6, vcc
+; CI-NEXT:	v_trunc_f64_e32 v[6:7], s[14:15]
+; CI-NEXT:	v_mov_b32_e32 v0, 0
+; CI-NEXT:	v_add_f64 v[0:1], v[4:5], v[0:1]
+; CI-NEXT:	v_add_f64 v[4:5], s[14:15], -v[6:7]
+; CI-NEXT:	v_mov_b32_e32 v8, s15
+; CI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; CI-NEXT:	v_bfi_b32 v8, s2, v16, v8
+; CI-NEXT:	v_cndmask_b32_e32 v5, 0, v8, vcc
+; CI-NEXT:	v_trunc_f64_e32 v[8:9], s[12:13]
+; CI-NEXT:	v_mov_b32_e32 v4, 0
+; CI-NEXT:	v_add_f64 v[6:7], v[6:7], v[4:5]
+; CI-NEXT:	v_add_f64 v[4:5], s[12:13], -v[8:9]
+; CI-NEXT:	v_mov_b32_e32 v10, s13
+; CI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; CI-NEXT:	v_bfi_b32 v10, s2, v16, v10
+; CI-NEXT:	v_cndmask_b32_e32 v5, 0, v10, vcc
+; CI-NEXT:	v_mov_b32_e32 v4, 0
+; CI-NEXT:	v_add_f64 v[4:5], v[8:9], v[4:5]
+; CI-NEXT:	v_mov_b32_e32 v8, s19
+; CI-NEXT:	v_bfi_b32 v18, s2, v16, v8
+; CI-NEXT:	v_trunc_f64_e32 v[8:9], s[20:21]
+; CI-NEXT:	v_trunc_f64_e32 v[10:11], s[22:23]
+; CI-NEXT:	v_add_f64 v[14:15], s[20:21], -v[8:9]
+; CI-NEXT:	v_mov_b32_e32 v19, s23
+; CI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
+; CI-NEXT:	v_add_f64 v[14:15], s[22:23], -v[10:11]
+; CI-NEXT:	v_mov_b32_e32 v17, s21
+; CI-NEXT:	v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
+; CI-NEXT:	v_bfi_b32 v19, s2, v16, v19
+; CI-NEXT:	v_trunc_f64_e32 v[12:13], s[16:17]
+; CI-NEXT:	v_bfi_b32 v17, s2, v16, v17
+; CI-NEXT:	v_cndmask_b32_e64 v15, 0, v19, s[0:1]
+; CI-NEXT:	v_mov_b32_e32 v14, 0
+; CI-NEXT:	v_add_f64 v[10:11], v[10:11], v[14:15]
+; CI-NEXT:	v_cndmask_b32_e32 v15, 0, v17, vcc
+; CI-NEXT:	v_mov_b32_e32 v14, 0
+; CI-NEXT:	v_mov_b32_e32 v17, s17
+; CI-NEXT:	v_add_f64 v[8:9], v[8:9], v[14:15]
+; CI-NEXT:	v_add_f64 v[14:15], s[16:17], -v[12:13]
+; CI-NEXT:	v_bfi_b32 v19, s2, v16, v17
+; CI-NEXT:	v_trunc_f64_e32 v[16:17], s[18:19]
+; CI-NEXT:	v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
+; CI-NEXT:	v_add_f64 v[14:15], s[18:19], -v[16:17]
+; CI-NEXT:	s_mov_b32 s6, -1
+; CI-NEXT:	v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
+; CI-NEXT:	v_mov_b32_e32 v14, 0
+; CI-NEXT:	v_cndmask_b32_e64 v15, 0, v18, s[0:1]
+; CI-NEXT:	v_add_f64 v[14:15], v[16:17], v[14:15]
+; CI-NEXT:	v_cndmask_b32_e32 v17, 0, v19, vcc
+; CI-NEXT:	v_mov_b32_e32 v16, 0
+; CI-NEXT:	v_add_f64 v[12:13], v[12:13], v[16:17]
+; CI-NEXT:	buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
+; CI-NEXT:	buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32
+; CI-NEXT:	buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; CI-NEXT:	buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:	s_endpgm
   %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
   store <8 x double> %result, <8 x double> addrspace(1)* %out
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
index cb45d1172e6c..925a2daa93da 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
@@ -566,7 +566,6 @@ define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)*
 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
 ; GCN: buffer_store_dwordx4
 ; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dword v
 ; CI: buffer_store_dwordx3
 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4

diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index a9a60b93ef54..50df07e0a8f4 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -17,9 +17,9 @@ define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)*  %buffer) {
 ;
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
@@ -86,14 +86,14 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)*  %buffer) {
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
@@ -300,9 +300,9 @@ define amdgpu_kernel void @Offset64(i8 addrspace(1)*  %buffer) {
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ;
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
@@ -456,10 +456,10 @@ define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
 ;
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 2b8eba5f9014..a4d08ec980f4 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -6,14 +6,14 @@
 define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
 ; CIVI-LABEL: local_store_i56:
 ; CIVI:       ; %bb.0:
-; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIVI-NEXT:    s_mov_b32 m0, -1
-; CIVI-NEXT:    ds_write_b16 v0, v2 offset:4
-; CIVI-NEXT:    ds_write_b32 v0, v1
-; CIVI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; CIVI-NEXT:    ds_write_b8 v0, v1 offset:6
-; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_setpc_b64 s[30:31]
+; CIVI-NEXT:        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:        v_lshrrev_b32_e32 v3, 16, v2
+; CIVI-NEXT:        s_mov_b32 m0, -1
+; CIVI-NEXT:        ds_write_b8 v0, v3 offset:6
+; CIVI-NEXT:        ds_write_b16 v0, v2 offset:4
+; CIVI-NEXT:        ds_write_b32 v0, v1
+; CIVI-NEXT:        s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:        s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: local_store_i56:
 ; GFX9:       ; %bb.0:
@@ -30,70 +30,70 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
 define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
 ; HAWAII-LABEL: local_store_i55:
 ; HAWAII:       ; %bb.0:
-; HAWAII-NEXT:    s_or_b32 s0, s4, 14
-; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
-; HAWAII-NEXT:    v_mov_b32_e32 v1, s5
-; HAWAII-NEXT:    flat_load_ubyte v0, v[0:1]
-; HAWAII-NEXT:    s_load_dword s0, s[4:5], 0x0
-; HAWAII-NEXT:    s_load_dword s1, s[4:5], 0x2
-; HAWAII-NEXT:    s_load_dword s2, s[4:5], 0x3
-; HAWAII-NEXT:    s_mov_b32 m0, -1
-; HAWAII-NEXT:    s_waitcnt lgkmcnt(0)
-; HAWAII-NEXT:    v_mov_b32_e32 v1, s0
-; HAWAII-NEXT:    v_mov_b32_e32 v3, s1
-; HAWAII-NEXT:    v_mov_b32_e32 v2, s2
-; HAWAII-NEXT:    ds_write_b16 v1, v2 offset:4
-; HAWAII-NEXT:    s_waitcnt vmcnt(0)
-; HAWAII-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; HAWAII-NEXT:    ds_write_b8 v1, v0 offset:6
-; HAWAII-NEXT:    ds_write_b32 v1, v3
-; HAWAII-NEXT:    s_endpgm
+; HAWAII-NEXT:        s_or_b32 s0, s4, 14
+; HAWAII-NEXT:        v_mov_b32_e32 v0, s0
+; HAWAII-NEXT:        v_mov_b32_e32 v1, s5
+; HAWAII-NEXT:        flat_load_ubyte v0, v[0:1]
+; HAWAII-NEXT:        s_load_dword s0, s[4:5], 0x0
+; HAWAII-NEXT:        s_load_dword s1, s[4:5], 0x2
+; HAWAII-NEXT:        s_load_dword s2, s[4:5], 0x3
+; HAWAII-NEXT:        s_mov_b32 m0, -1
+; HAWAII-NEXT:        s_waitcnt lgkmcnt(0)
+; HAWAII-NEXT:        v_mov_b32_e32 v1, s0
+; HAWAII-NEXT:        v_mov_b32_e32 v2, s1
+; HAWAII-NEXT:        v_mov_b32_e32 v3, s2
+; HAWAII-NEXT:        s_waitcnt vmcnt(0)
+; HAWAII-NEXT:        v_and_b32_e32 v0, 0x7f, v0
+; HAWAII-NEXT:        ds_write_b8 v1, v0 offset:6
+; HAWAII-NEXT:        ds_write_b16 v1, v3 offset:4
+; HAWAII-NEXT:        ds_write_b32 v1, v2
+; HAWAII-NEXT:        s_endpgm
 ;
 ; FIJI-LABEL: local_store_i55:
 ; FIJI:       ; %bb.0:
-; FIJI-NEXT:    s_or_b32 s0, s4, 14
-; FIJI-NEXT:    v_mov_b32_e32 v0, s0
-; FIJI-NEXT:    v_mov_b32_e32 v1, s5
-; FIJI-NEXT:    flat_load_ubyte v0, v[0:1]
-; FIJI-NEXT:    s_load_dword s0, s[4:5], 0x0
-; FIJI-NEXT:    s_load_dword s1, s[4:5], 0x8
-; FIJI-NEXT:    s_load_dword s2, s[4:5], 0xc
-; FIJI-NEXT:    s_mov_b32 m0, -1
-; FIJI-NEXT:    s_waitcnt lgkmcnt(0)
-; FIJI-NEXT:    v_mov_b32_e32 v1, s0
-; FIJI-NEXT:    v_mov_b32_e32 v3, s1
-; FIJI-NEXT:    s_and_b32 s3, s2, 0xffff
-; FIJI-NEXT:    v_mov_b32_e32 v2, s2
-; FIJI-NEXT:    ds_write_b16 v1, v2 offset:4
-; FIJI-NEXT:    s_waitcnt vmcnt(0)
-; FIJI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; FIJI-NEXT:    v_or_b32_e32 v0, s3, v0
-; FIJI-NEXT:    v_bfe_u32 v0, v0, 16, 7
-; FIJI-NEXT:    ds_write_b8 v1, v0 offset:6
-; FIJI-NEXT:    ds_write_b32 v1, v3
-; FIJI-NEXT:    s_endpgm
+; FIJI-NEXT:        s_or_b32 s0, s4, 14
+; FIJI-NEXT:        v_mov_b32_e32 v0, s0
+; FIJI-NEXT:        v_mov_b32_e32 v1, s5
+; FIJI-NEXT:        flat_load_ubyte v0, v[0:1]
+; FIJI-NEXT:        s_load_dword s0, s[4:5], 0x0
+; FIJI-NEXT:        s_load_dword s1, s[4:5], 0x8
+; FIJI-NEXT:        s_load_dword s2, s[4:5], 0xc
+; FIJI-NEXT:        s_mov_b32 m0, -1
+; FIJI-NEXT:        s_waitcnt lgkmcnt(0)
+; FIJI-NEXT:        v_mov_b32_e32 v1, s0
+; FIJI-NEXT:        v_mov_b32_e32 v3, s1
+; FIJI-NEXT:        s_and_b32 s3, s2, 0xffff
+; FIJI-NEXT:        v_mov_b32_e32 v2, s2
+; FIJI-NEXT:        s_waitcnt vmcnt(0)
+; FIJI-NEXT:        v_lshlrev_b32_e32 v0, 16, v0
+; FIJI-NEXT:        v_or_b32_e32 v0, s3, v0
+; FIJI-NEXT:        v_bfe_u32 v0, v0, 16, 7
+; FIJI-NEXT:        ds_write_b8 v1, v0 offset:6
+; FIJI-NEXT:        ds_write_b16 v1, v2 offset:4
+; FIJI-NEXT:        ds_write_b32 v1, v3
+; FIJI-NEXT:        s_endpgm
 ;
 ; GFX9-LABEL: local_store_i55:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:14
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[4:5], 0x8
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0xc
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    s_and_b32 s3, s2, 0xffff
-; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v1, s3, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fffff, v1
-; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:6
-; GFX9-NEXT:    ds_write_b32 v0, v3
-; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:        v_mov_b32_e32 v0, s4
+; GFX9-NEXT:        v_mov_b32_e32 v1, s5
+; GFX9-NEXT:        v_mov_b32_e32 v2, 0
+; GFX9-NEXT:        global_load_ubyte_d16_hi v2, v[0:1], off offset:14
+; GFX9-NEXT:        s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:        s_load_dword s1, s[4:5], 0x8
+; GFX9-NEXT:        s_load_dword s2, s[4:5], 0xc
+; GFX9-NEXT:        s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:        v_mov_b32_e32 v0, s0
+; GFX9-NEXT:        v_mov_b32_e32 v3, s1
+; GFX9-NEXT:        s_and_b32 s3, s2, 0xffff
+; GFX9-NEXT:        v_mov_b32_e32 v1, s2
+; GFX9-NEXT:        s_waitcnt vmcnt(0)
+; GFX9-NEXT:        v_or_b32_e32 v2, s3, v2
+; GFX9-NEXT:        v_and_b32_e32 v2, 0x7fffff, v2
+; GFX9-NEXT:        ds_write_b8_d16_hi v0, v2 offset:6
+; GFX9-NEXT:        ds_write_b16 v0, v1 offset:4
+; GFX9-NEXT:        ds_write_b32 v0, v3
+; GFX9-NEXT:        s_endpgm
   store i55 %arg, i55 addrspace(3)* %ptr, align 8
   ret void
 }
@@ -101,31 +101,31 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
 define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 {
 ; HAWAII-LABEL: local_store_i48:
 ; HAWAII:       ; %bb.0:
-; HAWAII-NEXT:    s_load_dword s0, s[4:5], 0x0
-; HAWAII-NEXT:    s_load_dword s1, s[4:5], 0x2
-; HAWAII-NEXT:    s_load_dword s2, s[4:5], 0x3
-; HAWAII-NEXT:    s_mov_b32 m0, -1
-; HAWAII-NEXT:    s_waitcnt lgkmcnt(0)
-; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
-; HAWAII-NEXT:    v_mov_b32_e32 v1, s2
-; HAWAII-NEXT:    ds_write_b16 v0, v1 offset:4
-; HAWAII-NEXT:    v_mov_b32_e32 v1, s1
-; HAWAII-NEXT:    ds_write_b32 v0, v1
-; HAWAII-NEXT:    s_endpgm
+; HAWAII-NEXT:        s_load_dword s0, s[4:5], 0x0
+; HAWAII-NEXT:        s_load_dword s1, s[4:5], 0x2
+; HAWAII-NEXT:        s_load_dword s2, s[4:5], 0x3
+; HAWAII-NEXT:        s_mov_b32 m0, -1
+; HAWAII-NEXT:        s_waitcnt lgkmcnt(0)
+; HAWAII-NEXT:        v_mov_b32_e32 v0, s0
+; HAWAII-NEXT:        v_mov_b32_e32 v2, s1
+; HAWAII-NEXT:        v_mov_b32_e32 v1, s2
+; HAWAII-NEXT:        ds_write_b16 v0, v1 offset:4
+; HAWAII-NEXT:        ds_write_b32 v0, v2
+; HAWAII-NEXT:        s_endpgm
 ;
 ; FIJI-LABEL: local_store_i48:
 ; FIJI:       ; %bb.0:
-; FIJI-NEXT:    s_load_dword s0, s[4:5], 0x0
-; FIJI-NEXT:    s_load_dword s1, s[4:5], 0x8
-; FIJI-NEXT:    s_load_dword s2, s[4:5], 0xc
-; FIJI-NEXT:    s_mov_b32 m0, -1
-; FIJI-NEXT:    s_waitcnt lgkmcnt(0)
-; FIJI-NEXT:    v_mov_b32_e32 v0, s0
-; FIJI-NEXT:    v_mov_b32_e32 v1, s2
-; FIJI-NEXT:    ds_write_b16 v0, v1 offset:4
-; FIJI-NEXT:    v_mov_b32_e32 v1, s1
-; FIJI-NEXT:    ds_write_b32 v0, v1
-; FIJI-NEXT:    s_endpgm
+; FIJI-NEXT:        s_load_dword s0, s[4:5], 0x0
+; FIJI-NEXT:        s_load_dword s1, s[4:5], 0x8
+; FIJI-NEXT:        s_load_dword s2, s[4:5], 0xc
+; FIJI-NEXT:        s_mov_b32 m0, -1
+; FIJI-NEXT:        s_waitcnt lgkmcnt(0)
+; FIJI-NEXT:        v_mov_b32_e32 v0, s0
+; FIJI-NEXT:        v_mov_b32_e32 v2, s1
+; FIJI-NEXT:        v_mov_b32_e32 v1, s2
+; FIJI-NEXT:        ds_write_b16 v0, v1 offset:4
+; FIJI-NEXT:        ds_write_b32 v0, v2
+; FIJI-NEXT:        s_endpgm
 ;
 ; GFX9-LABEL: local_store_i48:
 ; GFX9:       ; %bb.0:
@@ -146,35 +146,35 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0
 define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 {
 ; HAWAII-LABEL: local_store_i65:
 ; HAWAII:       ; %bb.0:
-; HAWAII-NEXT:    s_load_dword s2, s[4:5], 0x0
-; HAWAII-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; HAWAII-NEXT:    s_load_dword s3, s[4:5], 0x4
-; HAWAII-NEXT:    s_mov_b32 m0, -1
-; HAWAII-NEXT:    s_waitcnt lgkmcnt(0)
-; HAWAII-NEXT:    v_mov_b32_e32 v2, s2
-; HAWAII-NEXT:    s_and_b32 s3, s3, 1
-; HAWAII-NEXT:    v_mov_b32_e32 v0, s3
-; HAWAII-NEXT:    ds_write_b8 v2, v0 offset:8
-; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
-; HAWAII-NEXT:    v_mov_b32_e32 v1, s1
-; HAWAII-NEXT:    ds_write_b64 v2, v[0:1]
-; HAWAII-NEXT:    s_endpgm
+; HAWAII-NEXT:        s_load_dword s2, s[4:5], 0x0
+; HAWAII-NEXT:        s_load_dwordx2 s[0:1], s[4:5], 0x2
+; HAWAII-NEXT:        s_load_dword s3, s[4:5], 0x4
+; HAWAII-NEXT:        s_mov_b32 m0, -1
+; HAWAII-NEXT:        s_waitcnt lgkmcnt(0)
+; HAWAII-NEXT:        v_mov_b32_e32 v2, s2
+; HAWAII-NEXT:        v_mov_b32_e32 v0, s0
+; HAWAII-NEXT:        s_and_b32 s3, s3, 1
+; HAWAII-NEXT:        v_mov_b32_e32 v3, s3
+; HAWAII-NEXT:        v_mov_b32_e32 v1, s1
+; HAWAII-NEXT:        ds_write_b8 v2, v3 offset:8
+; HAWAII-NEXT:        ds_write_b64 v2, v[0:1]
+; HAWAII-NEXT:        s_endpgm
 ;
 ; FIJI-LABEL: local_store_i65:
 ; FIJI:       ; %bb.0:
-; FIJI-NEXT:    s_load_dword s2, s[4:5], 0x0
-; FIJI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; FIJI-NEXT:    s_load_dword s3, s[4:5], 0x10
-; FIJI-NEXT:    s_mov_b32 m0, -1
-; FIJI-NEXT:    s_waitcnt lgkmcnt(0)
-; FIJI-NEXT:    v_mov_b32_e32 v2, s2
-; FIJI-NEXT:    s_and_b32 s3, s3, 1
-; FIJI-NEXT:    v_mov_b32_e32 v0, s3
-; FIJI-NEXT:    ds_write_b8 v2, v0 offset:8
-; FIJI-NEXT:    v_mov_b32_e32 v0, s0
-; FIJI-NEXT:    v_mov_b32_e32 v1, s1
-; FIJI-NEXT:    ds_write_b64 v2, v[0:1]
-; FIJI-NEXT:    s_endpgm
+; FIJI-NEXT:        s_load_dword s2, s[4:5], 0x0
+; FIJI-NEXT:        s_load_dwordx2 s[0:1], s[4:5], 0x8
+; FIJI-NEXT:        s_load_dword s3, s[4:5], 0x10
+; FIJI-NEXT:        s_mov_b32 m0, -1
+; FIJI-NEXT:        s_waitcnt lgkmcnt(0)
+; FIJI-NEXT:        v_mov_b32_e32 v2, s2
+; FIJI-NEXT:        v_mov_b32_e32 v0, s0
+; FIJI-NEXT:        s_and_b32 s3, s3, 1
+; FIJI-NEXT:        v_mov_b32_e32 v3, s3
+; FIJI-NEXT:        v_mov_b32_e32 v1, s1
+; FIJI-NEXT:        ds_write_b8 v2, v3 offset:8
+; FIJI-NEXT:        ds_write_b64 v2, v[0:1]
+; FIJI-NEXT:        s_endpgm
 ;
 ; GFX9-LABEL: local_store_i65:
 ; GFX9:       ; %bb.0:
@@ -218,22 +218,22 @@ define void @local_store_i13(i13 addrspace(3)* %ptr, i13 %arg) #0 {
 define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 {
 ; CIVI-LABEL: local_store_i17:
 ; CIVI:       ; %bb.0:
-; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIVI-NEXT:    s_mov_b32 m0, -1
-; CIVI-NEXT:    ds_write_b16 v0, v1
-; CIVI-NEXT:    v_bfe_u32 v1, v1, 16, 1
-; CIVI-NEXT:    ds_write_b8 v0, v1 offset:2
-; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_setpc_b64 s[30:31]
+; CIVI-NEXT:        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:        s_mov_b32 m0, -1
+; CIVI-NEXT:        v_bfe_u32 v2, v1, 16, 1
+; CIVI-NEXT:        ds_write_b16 v0, v1
+; CIVI-NEXT:        ds_write_b8 v0, v2 offset:2
+; CIVI-NEXT:        s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:        s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: local_store_i17:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_write_b16 v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x1ffff, v1
-; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:        v_and_b32_e32 v2, 0x1ffff, v1
+; GFX9-NEXT:        ds_write_b16 v0, v1
+; GFX9-NEXT:        ds_write_b8_d16_hi v0, v2 offset:2
+; GFX9-NEXT:        s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:        s_setpc_b64 s[30:31]
   store i17 %arg, i17 addrspace(3)* %ptr, align 8
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll
index 88d9eeb8f266..1648c7fe37cc 100644
--- a/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll
@@ -9,14 +9,14 @@
 ; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
 ; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
 ; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
+; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
+; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
+; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
 ; GCN-TFILD: buffer_store_dword [[REG8]], {{.*$}}
 ; GCN-TFILD: buffer_store_dword [[REG9]], {{.*}} offset:4
 ; GCN-TFILD: buffer_store_dword [[REG10]], {{.*}} offset:8
 ; GCN-TFILD: buffer_store_dword [[REG11]], {{.*}} offset:12
 ; GCN-TFILD: buffer_store_dword [[REG12]], {{.*}} offset:16
-; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
 ; GCN-TFILD: buffer_store_dword [[REG13]], {{.*}} offset:20
 ; GCN-TFILD: buffer_store_dword [[REG14]], {{.*}} offset:24
 ; GCN-TFILD: buffer_store_dword [[REG15]], {{.*}} offset:28
@@ -24,14 +24,14 @@
 ; GCN-TFIL7: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
 ; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
 ; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:28
-; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:24
-; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:20
 ; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
 ; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
 ; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
 ; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
 ; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
+; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:28
+; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:24
+; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:20
 ; GCN-TFIL7: buffer_store_dword [[REG12]], {{.*}} offset:16
 ; GCN-TFIL7: buffer_store_dword [[REG11]], {{.*}} offset:12
 ; GCN-TFIL7: buffer_store_dword [[REG10]], {{.*}} offset:8

diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index 559f1092e6e4..fe613f2579e4 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -39,6 +39,7 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32
 ; GFX6-NEXT:        s_load_dword s3, s[0:1], 0x26
 ; GFX6-NEXT:        s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX6-NEXT:        s_load_dwordx2 s[8:9], s[0:1], 0x13
+; GFX6-NEXT:        s_load_dword s0, s[0:1], 0x1d
 ; GFX6-NEXT:        s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:        s_mov_b32 s6, -1
 ; GFX6-NEXT:        s_mov_b32 s10, s6
@@ -50,14 +51,12 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32
 ; GFX6-NEXT:        v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:        v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:        v_mul_lo_u32 v1, s2, v0
-; GFX6-NEXT:        s_load_dword s2, s[0:1], 0x1d
 ; GFX6-NEXT:        v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:        v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:        s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:        v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT:        v_mul_hi_u32 v0, s0, v0
 ; GFX6-NEXT:        v_mul_lo_u32 v1, v0, s3
 ; GFX6-NEXT:        v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:        v_sub_i32_e32 v1, vcc, s2, v1
+; GFX6-NEXT:        v_sub_i32_e32 v1, vcc, s0, v1
 ; GFX6-NEXT:        v_cmp_le_u32_e64 s[0:1], s3, v1
 ; GFX6-NEXT:        v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:        v_subrev_i32_e32 v2, vcc, s3, v1
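
The load/store reordering visible in the test updates above comes from expressing the clustering cap as a total dword budget. As a rough, self-contained illustration only (the function name, the default budget of 8 dwords, and the assumption that every mem op in a candidate cluster accesses the same number of bytes are assumptions of this sketch, not the committed SIInstrInfo.cpp code):

#include <cassert>
#include <cstdio>

// Sketch of a dword-budget clustering check. NumBytes is the total
// across NumLoads ops, all assumed to be the same width. MaxDWORDs is
// a hypothetical tuning knob in the spirit of a num-clustered-dwords
// style limit.
static bool shouldClusterSketch(unsigned NumLoads, unsigned NumBytes,
                                unsigned MaxDWORDs = 8) {
  assert(NumLoads != 0 && "expected at least one mem op");
  unsigned BytesPerOp = NumBytes / NumLoads;
  // Round each op up to whole dwords so sub-word loads count as one dword.
  unsigned DWORDsPerOp = (BytesPerOp + 3) / 4;
  return NumLoads * DWORDsPerOp <= MaxDWORDs;
}

int main() {
  std::printf("5 x 4-byte loads : %d\n", shouldClusterSketch(5, 20)); // within budget
  std::printf("2 x 16-byte loads: %d\n", shouldClusterSketch(2, 32)); // within budget
  std::printf("3 x 16-byte loads: %d\n", shouldClusterSketch(3, 48)); // exceeds budget
  return 0;
}

Under this sketch, many narrow loads and only a few very wide loads fit in the same budget, which is the general shape of behavior the updated CHECK lines above reflect.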


        

