[llvm] 0412f51 - [AMDGPU] Fix typo in SIInstrInfo::memOpsHaveSameBasePtr

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 17 10:54:46 PST 2019


Author: Jay Foad
Date: 2019-12-17T18:54:27Z
New Revision: 0412f518dcb05216d2321c28366eb760b65baebc

URL: https://github.com/llvm/llvm-project/commit/0412f518dcb05216d2321c28366eb760b65baebc
DIFF: https://github.com/llvm/llvm-project/commit/0412f518dcb05216d2321c28366eb760b65baebc.diff

LOG: [AMDGPU] Fix typo in SIInstrInfo::memOpsHaveSameBasePtr

Summary:
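The typo has been present since memOpsHaveSameBasePtr was introduced in
r313208: Base2 was computed as GetUnderlyingObject(Base1, DL) instead of
GetUnderlyingObject(Base2, DL), so Base2's own underlying object was
never looked up.

It caused SIInstrInfo::shouldClusterMemOps to cluster more mem ops than
it was supposed to.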

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71616

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/test/CodeGen/AMDGPU/add.i16.ll
    llvm/test/CodeGen/AMDGPU/ctpop.ll
    llvm/test/CodeGen/AMDGPU/ctpop16.ll
    llvm/test/CodeGen/AMDGPU/fadd.f16.ll
    llvm/test/CodeGen/AMDGPU/global_smrd.ll
    llvm/test/CodeGen/AMDGPU/idot2.ll
    llvm/test/CodeGen/AMDGPU/idot4s.ll
    llvm/test/CodeGen/AMDGPU/idot4u.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
    llvm/test/CodeGen/AMDGPU/madak.ll
    llvm/test/CodeGen/AMDGPU/max.i16.ll
    llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
    llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
    llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
    llvm/test/CodeGen/AMDGPU/sub.i16.ll
    llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
    llvm/test/CodeGen/AMDGPU/trunc-combine.ll
    llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
    llvm/test/CodeGen/AMDGPU/wait.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 79481a1a0d21..2224dbc91aaf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -423,7 +423,7 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
   const MachineFunction &MF = *MI1.getParent()->getParent();
   const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
   Base1 = GetUnderlyingObject(Base1, DL);
-  Base2 = GetUnderlyingObject(Base1, DL);
+  Base2 = GetUnderlyingObject(Base2, DL);
 
   if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
     return false;

diff  --git a/llvm/test/CodeGen/AMDGPU/add.i16.ll b/llvm/test/CodeGen/AMDGPU/add.i16.ll
index 8da3401b3dbe..98848295a73b 100644
--- a/llvm/test/CodeGen/AMDGPU/add.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.i16.ll
@@ -105,7 +105,7 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i1
 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]],  [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]],  [[A]], [[B]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
 define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@@ -125,7 +125,7 @@ define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i1
 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}

diff  --git a/llvm/test/CodeGen/AMDGPU/ctpop.ll b/llvm/test/CodeGen/AMDGPU/ctpop.ll
index df661b8f4aa1..70ac38d94fe5 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop.ll
@@ -284,7 +284,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %ou
 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
 ; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
 ; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
-; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 9e774d151c4c..5990697fcec0 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -285,7 +285,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(i16 addrspace(1)* noalias %ou
 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
 ; VI: flat_load_ushort [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
 ; VI: flat_load_ushort [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
-; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
 ; GCN: buffer_store_short [[RESULT]],
 ; GCN: s_endpgm
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index 30fdb1df8268..3d796464777d 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -62,8 +62,8 @@ entry:
 ; GCN-LABEL: {{^}}fadd_v2f16:
 ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; VI: flat_load_dword v[[B_V2_F16:[0-9]+]]
 ; VI: flat_load_dword v[[A_V2_F16:[0-9]+]]
+; VI: flat_load_dword v[[B_V2_F16:[0-9]+]]
 
 ; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]

diff  --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll
index e4ad729ff715..070aa1342e10 100644
--- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll
@@ -83,8 +83,8 @@ define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, [8 x i32], i32 addrspac
 ; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]]
 ; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0
 ; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0
-; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0
 ; CHECK: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0
+; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
 @A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index 74132defec7a..1c97e2a2e2f8 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -20,15 +20,16 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX7-NEXT:    s_lshr_b32 s9, s5, 16
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
 ; GFX7-NEXT:    s_and_b32 s4, s4, s8
-; GFX7-NEXT:    v_mov_b32_e32 v0, s7
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v0, v1
 ; GFX7-NEXT:    s_and_b32 s5, s5, s8
+; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -88,14 +89,14 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -105,14 +106,14 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s4, s3, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s3, s2, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                  <2 x i16> addrspace(1)* %src2,
@@ -154,17 +155,18 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
 ; GFX7-NEXT:    s_and_b32 s4, s4, s8
-; GFX7-NEXT:    s_lshr_b32 s9, s5, 16
-; GFX7-NEXT:    s_and_b32 s5, s5, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
+; GFX7-NEXT:    s_and_b32 s5, s5, s8
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v0, s5, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s6, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -250,16 +252,16 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s6, s3, s2
+; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v0, s2, s6
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s4, s3, v0
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s5, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s3, s2
-; GFX10-DL-NEXT:    s_and_b32 s1, s4, s2
-; GFX10-DL-NEXT:    s_lshr_b32 s2, s3, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 16
-; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v2, s1, s0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s5, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                         <2 x i16> addrspace(1)* %src2,
@@ -318,18 +320,18 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sext_i32_i16 s0, s2
+; GFX8-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX8-NEXT:    s_sext_i32_i16 s1, s3
+; GFX8-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -341,18 +343,18 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_sext_i32_i16 s0, s2
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NODL-NEXT:    s_sext_i32_i16 s1, s3
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -362,14 +364,14 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT:    v_dot2_i32_i16 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_dot2_i32_i16 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -379,14 +381,14 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_dot2_i32_i16 v2, s4, s3, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-DL-NEXT:    v_dot2_i32_i16 v2, s3, s2, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                  <2 x i16> addrspace(1)* %src2,
@@ -446,18 +448,18 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sext_i32_i16 s0, s2
+; GFX8-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX8-NEXT:    s_sext_i32_i16 s1, s3
+; GFX8-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
+; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -469,18 +471,18 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_sext_i32_i16 s0, s2
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NODL-NEXT:    s_sext_i32_i16 s1, s3
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -492,18 +494,18 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_sext_i32_i16 s0, s2
+; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-DL-NEXT:    s_sext_i32_i16 s1, s3
+; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -516,16 +518,16 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s3, 16
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_lshr_b32 s5, s2, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-DL-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX10-DL-NEXT:    s_sext_i32_i16 s3, s3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s5, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s3, s2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                <2 x i16> addrspace(1)* %src2,
@@ -564,15 +566,16 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX7-NEXT:    s_lshr_b32 s9, s5, 16
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
 ; GFX7-NEXT:    s_and_b32 s4, s4, s8
-; GFX7-NEXT:    v_mov_b32_e32 v0, s7
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v0, v1
 ; GFX7-NEXT:    s_and_b32 s5, s5, s8
+; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -632,14 +635,14 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -649,14 +652,14 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s4, s3, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s3, s2, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                  <2 x i16> addrspace(1)* %src2,
@@ -716,18 +719,18 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sext_i32_i16 s0, s2
+; GFX8-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX8-NEXT:    s_and_b32 s1, s3, 0xffff
+; GFX8-NEXT:    s_and_b32 s6, s3, 0xffff
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -739,18 +742,18 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_sext_i32_i16 s0, s2
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NODL-NEXT:    s_and_b32 s1, s3, 0xffff
+; GFX9-NODL-NEXT:    s_and_b32 s6, s3, 0xffff
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -762,18 +765,18 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_sext_i32_i16 s0, s2
+; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-DL-NEXT:    s_and_b32 s1, s3, 0xffff
+; GFX9-DL-NEXT:    s_and_b32 s6, s3, 0xffff
 ; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -786,16 +789,16 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_ashr_i32 s0, s2, 16
-; GFX10-DL-NEXT:    s_ashr_i32 s1, s3, 16
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_ashr_i32 s5, s2, 16
+; GFX10-DL-NEXT:    s_ashr_i32 s6, s3, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-DL-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX10-DL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s5, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s3, s2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                           <2 x i16> addrspace(1)* %src2,
@@ -831,13 +834,13 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[0:1], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshr_b32 s5, s6, 16
+; GFX7-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX7-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX7-NEXT:    s_lshr_b32 s5, s5, 16
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s5, s5, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s4, s4, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -848,17 +851,17 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s0, s2, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s1, s3, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, s1, v2
+; GFX8-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s0, s0, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, s1, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -867,17 +870,17 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_and_b32 s0, s2, 0xffff
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, s1, v2
+; GFX9-NODL-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, s0, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, s1, v2
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -886,17 +889,17 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s0, s2, 0xffff
-; GFX9-DL-NEXT:    s_lshr_b32 s1, s3, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, s1, v2
+; GFX9-DL-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, s0, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, s1, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -906,16 +909,16 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX10-DL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s2, s4
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s3, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s2, 0xffff
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s3, 16
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s1, s4
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s0, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                             <2 x i16> addrspace(1)* %src2,
@@ -954,17 +957,18 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s4, s8
+; GFX7-NEXT:    s_and_b32 s6, s4, s8
+; GFX7-NEXT:    s_and_b32 s7, s5, s8
+; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX7-NEXT:    s_and_b32 s8, s5, s8
 ; GFX7-NEXT:    s_lshr_b32 s5, s5, 16
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -1022,14 +1026,14 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1039,14 +1043,14 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s4, s3, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s3, s2, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i16> addrspace(1)* %src2,
@@ -1085,17 +1089,18 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x1
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s4, s8
+; GFX7-NEXT:    s_and_b32 s6, s4, s8
+; GFX7-NEXT:    s_and_b32 s7, s5, s8
+; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX7-NEXT:    s_and_b32 s8, s5, s8
 ; GFX7-NEXT:    s_lshr_b32 s5, s5, 16
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -1153,14 +1158,14 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x4
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x4
-; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1170,14 +1175,14 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x4
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x4
-; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x4
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s4, s3, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s3, s2, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                           <4 x i16> addrspace(1)* %src2,
@@ -1216,15 +1221,16 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s9, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_and_b32 s5, s5, s8
 ; GFX7-NEXT:    s_and_b32 s4, s4, s8
+; GFX7-NEXT:    s_and_b32 s6, s6, s8
 ; GFX7-NEXT:    s_and_b32 s7, s7, s8
+; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
-; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
-; GFX7-NEXT:    s_and_b32 s6, s6, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1312,16 +1318,16 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s3, s8
-; GFX10-DL-NEXT:    s_and_b32 s1, s5, s8
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-DL-NEXT:    s_and_b32 s3, s3, s8
+; GFX10-DL-NEXT:    s_and_b32 s5, s5, s8
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-DL-NEXT:    s_and_b32 s2, s2, s8
-; GFX10-DL-NEXT:    s_and_b32 s3, s4, s8
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    s_and_b32 s4, s4, s8
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s5, s3, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i16> addrspace(1)* %src2,
@@ -1360,15 +1366,16 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s9, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_and_b32 s5, s5, s8
-; GFX7-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX7-NEXT:    s_and_b32 s7, s7, s8
+; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
+; GFX7-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
-; GFX7-NEXT:    v_mov_b32_e32 v1, s9
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
 ; GFX7-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1456,16 +1463,16 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s3, s8
-; GFX10-DL-NEXT:    s_and_b32 s1, s5, s8
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-DL-NEXT:    s_and_b32 s3, s3, s8
+; GFX10-DL-NEXT:    s_and_b32 s5, s5, s8
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 16
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s5, s3, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                  <4 x i16> addrspace(1)* %src2,
@@ -1504,17 +1511,18 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX7-NEXT:    s_lshr_b32 s9, s5, 16
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
 ; GFX7-NEXT:    s_and_b32 s4, s4, s8
 ; GFX7-NEXT:    s_and_b32 s5, s5, s8
-; GFX7-NEXT:    v_mov_b32_e32 v0, s7
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -1600,16 +1608,16 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 16
-; GFX10-DL-NEXT:    s_and_b32 s1, s4, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX10-DL-NEXT:    s_and_b32 s7, s4, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-DL-NEXT:    s_and_b32 s2, s3, s2
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 16
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s7, s6, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <2 x i16> addrspace(1)* %src2,
@@ -1648,15 +1656,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX7-NEXT:    s_lshr_b32 s9, s5, 16
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
 ; GFX7-NEXT:    s_and_b32 s4, s4, s8
-; GFX7-NEXT:    v_mov_b32_e32 v0, s7
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v0, v1
 ; GFX7-NEXT:    s_and_b32 s5, s5, s8
+; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v1, s5, v1, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -1748,17 +1757,17 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s4, 16
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-DL-NEXT:    s_and_b32 s3, s3, s2
 ; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s2, s3, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s7, s6, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v0
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
@@ -1821,19 +1830,19 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sext_i32_i16 s0, s2
+; GFX8-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX8-NEXT:    s_sext_i32_i16 s1, s3
+; GFX8-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    v_mad_i32_i24 v3, s1, v3, v2
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX8-NEXT:    v_mad_i32_i24 v1, s6, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1845,19 +1854,19 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_sext_i32_i16 s0, s2
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NODL-NEXT:    s_sext_i32_i16 s1, s3
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, s1, v3, v2
-; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v0
+; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -1869,19 +1878,19 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_sext_i32_i16 s0, s2
+; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-DL-NEXT:    s_sext_i32_i16 s1, s3
+; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-DL-NEXT:    v_mad_i32_i24 v3, s1, v3, v2
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v0
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1894,17 +1903,17 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_ashr_i32 s0, s2, 16
-; GFX10-DL-NEXT:    s_ashr_i32 s1, s3, 16
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_ashr_i32 s5, s2, 16
+; GFX10-DL-NEXT:    s_ashr_i32 s6, s3, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-DL-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX10-DL-NEXT:    s_sext_i32_i16 s3, s3
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s3, s2, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s5, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s3, s2, v0
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
@@ -1945,17 +1954,18 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
 ; GFX7-NEXT:    s_and_b32 s4, s4, s8
-; GFX7-NEXT:    s_lshr_b32 s9, s5, 16
 ; GFX7-NEXT:    s_and_b32 s5, s5, s8
+; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX7-NEXT:    v_mad_u32_u24 v1, s5, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s7
-; GFX7-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -2045,17 +2055,17 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s6, s3, s2
+; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s6, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s4, s3, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s6, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s3, s2
-; GFX10-DL-NEXT:    s_and_b32 s1, s4, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-DL-NEXT:    s_lshr_b32 s2, s3, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 16
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
@@ -2119,19 +2129,19 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sext_i32_i16 s0, s2
-; GFX8-NEXT:    s_sext_i32_i16 s1, s3
+; GFX8-NEXT:    s_sext_i32_i16 s5, s2
+; GFX8-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s3, v4, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
+; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v2, v0
+; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2143,19 +2153,19 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_sext_i32_i16 s0, s2
-; GFX9-NODL-NEXT:    s_sext_i32_i16 s1, s3
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s3, v4, v2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s3, v2, v0
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -2167,19 +2177,19 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_sext_i32_i16 s0, s2
-; GFX9-DL-NEXT:    s_sext_i32_i16 s1, s3
+; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
+; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s3, v4, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s3, v2, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2192,17 +2202,17 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_sext_i32_i16 s0, s2
-; GFX10-DL-NEXT:    s_sext_i32_i16 s1, s3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_sext_i32_i16 s5, s2
+; GFX10-DL-NEXT:    s_sext_i32_i16 s6, s3
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-DL-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX10-DL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s3, s2, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s5, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s3, s2, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s6, s5, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
@@ -2244,16 +2254,17 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX7-NEXT:    s_lshr_b32 s9, s5, 16
-; GFX7-NEXT:    v_mov_b32_e32 v0, s7
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    v_mad_u32_u24 v1, s9, v0, v1
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
 ; GFX7-NEXT:    s_and_b32 s4, s4, s8
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v0, v1
 ; GFX7-NEXT:    s_and_b32 s5, s5, s8
+; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_mad_u32_u24 v1, s7, v0, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -2344,17 +2355,17 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s4, 16
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-DL-NEXT:    s_and_b32 s3, s3, s2
 ; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s7, s6, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s7, s6, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
@@ -2418,19 +2429,19 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sext_i32_i16 s0, s2
+; GFX8-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX8-NEXT:    s_sext_i32_i16 s1, s3
+; GFX8-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2442,19 +2453,19 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_sext_i32_i16 s0, s2
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NODL-NEXT:    s_sext_i32_i16 s1, s3
+; GFX9-NODL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -2466,19 +2477,19 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_sext_i32_i16 s0, s2
+; GFX9-DL-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-DL-NEXT:    s_sext_i32_i16 s1, s3
+; GFX9-DL-NEXT:    s_sext_i32_i16 s6, s3
 ; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2491,17 +2502,17 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_ashr_i32 s0, s2, 16
-; GFX10-DL-NEXT:    s_ashr_i32 s1, s3, 16
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_ashr_i32 s5, s2, 16
+; GFX10-DL-NEXT:    s_ashr_i32 s6, s3, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-DL-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX10-DL-NEXT:    s_sext_i32_i16 s3, s3
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s5, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s5, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s3, s2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
@@ -2561,23 +2572,23 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s3, s1, s2
+; GFX8-NEXT:    s_and_b32 s3, s1, s0
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s2, s0, s2
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    s_and_b32 s0, s2, s0
+; GFX8-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2585,23 +2596,23 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s2
+; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
 ; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX9-NODL-NEXT:    s_and_b32 s2, s0, s2
-; GFX9-NODL-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
+; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -2610,15 +2621,15 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2699,19 +2710,19 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
-; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
+; GFX8-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_bfe_i32 v5, v3, 0, 8
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
-; GFX8-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX8-NEXT:    v_mad_i32_i24 v3, v3, v4, s2
-; GFX8-NEXT:    v_mad_i32_i24 v2, v5, v2, v3
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v2, s2
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2726,20 +2737,20 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NODL-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX9-NODL-NEXT:    global_load_ushort v3, v[0:1], off
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
+; GFX9-NODL-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_bfe_i32 v5, v3, 0, 8
-; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
-; GFX9-NODL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX9-NODL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX9-NODL-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
 ; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX9-NODL-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v3, v4, s2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v5, v2, v3
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, v0, v2, s2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -2754,20 +2765,20 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX9-DL-NEXT:    global_load_ushort v3, v[0:1], off
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
+; GFX9-DL-NEXT:    v_bfe_i32 v1, v2, 0, 8
+; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_bfe_i32 v5, v3, 0, 8
-; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
-; GFX9-DL-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX9-DL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX9-DL-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
 ; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX9-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v3, v4, s2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, v5, v2, v3
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, v0, v2, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, v3, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 698a620965af..28eb2717bf44 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -16,22 +16,22 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_sext_i32_i8 s7, s4
-; GFX7-NEXT:    s_sext_i32_i8 s8, s5
-; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x80008
-; GFX7-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x80010
-; GFX7-NEXT:    v_mad_i32_i24 v0, s7, v0, v1
-; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x80008
-; GFX7-NEXT:    v_mov_b32_e32 v1, s10
-; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x80010
-; GFX7-NEXT:    v_mad_i32_i24 v0, s9, v1, v0
+; GFX7-NEXT:    s_sext_i32_i8 s6, s4
+; GFX7-NEXT:    s_sext_i32_i8 s7, s5
+; GFX7-NEXT:    s_bfe_i32 s9, s5, 0x80008
+; GFX7-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s12
+; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80010
+; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v0, v1
+; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80008
+; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x80010
+; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX7-NEXT:    s_ashr_i32 s5, s5, 24
-; GFX7-NEXT:    v_mad_i32_i24 v0, s11, v1, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
 ; GFX7-NEXT:    s_ashr_i32 s4, s4, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v1, v0
@@ -45,27 +45,27 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sext_i32_i8 s0, s2
-; GFX8-NEXT:    s_sext_i32_i8 s1, s3
-; GFX8-NEXT:    s_bfe_i32 s6, s3, 0x80008
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    s_bfe_i32 s8, s3, 0x80010
-; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v2, v3
-; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x80008
-; GFX8-NEXT:    v_mov_b32_e32 v3, s6
-; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x80010
-; GFX8-NEXT:    v_mad_i32_i24 v2, s5, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s8
+; GFX8-NEXT:    s_sext_i32_i8 s4, s2
+; GFX8-NEXT:    s_sext_i32_i8 s5, s3
+; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x80008
+; GFX8-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NEXT:    v_mov_b32_e32 v1, s10
+; GFX8-NEXT:    s_bfe_i32 s9, s3, 0x80010
+; GFX8-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x80008
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x80010
+; GFX8-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, 24
-; GFX8-NEXT:    v_mad_i32_i24 v2, s7, v3, v2
+; GFX8-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -76,27 +76,27 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s0, s2
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s1, s3
-; GFX9-NODL-NEXT:    s_bfe_i32 s6, s3, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NODL-NEXT:    s_bfe_i32 s8, s3, 0x80010
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s0, v2, v3
-; GFX9-NODL-NEXT:    s_bfe_i32 s5, s2, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NODL-NEXT:    s_bfe_i32 s7, s2, 0x80010
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s5, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
+; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT:    s_bfe_i32 s9, s3, 0x80010
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX9-NODL-NEXT:    s_bfe_i32 s6, s2, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NODL-NEXT:    s_bfe_i32 s8, s2, 0x80010
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 24
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s7, v3, v2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -105,15 +105,15 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s2, v2, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s4, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -123,14 +123,14 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_dot4_i32_i8 v2, s3, s4, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-DL-NEXT:    v_dot4_i32_i8 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
@@ -220,29 +220,29 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sext_i32_i8 s0, s2
-; GFX8-NEXT:    s_sext_i32_i8 s1, s3
+; GFX8-NEXT:    s_sext_i32_i8 s1, s2
+; GFX8-NEXT:    s_bfe_i32 s3, s2, 0x80008
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_i32 s4, s3, 0x80008
-; GFX8-NEXT:    s_bfe_i32 s5, s3, 0x80010
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_bfe_i32 s1, s2, 0x80008
-; GFX8-NEXT:    s_bfe_i32 s4, s2, 0x80010
-; GFX8-NEXT:    s_ashr_i32 s3, s3, 24
-; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x80010
+; GFX8-NEXT:    s_sext_i32_i8 s1, s0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
+; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x80008
+; GFX8-NEXT:    s_bfe_i32 s3, s0, 0x80010
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 24
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s3, v5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v4, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v5, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -251,29 +251,29 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s0, s2
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s1, s3
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s1, s2
+; GFX9-NODL-NEXT:    s_bfe_i32 s3, s2, 0x80008
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    s_bfe_i32 s4, s3, 0x80008
-; GFX9-NODL-NEXT:    s_bfe_i32 s5, s3, 0x80010
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT:    s_bfe_i32 s1, s2, 0x80008
-; GFX9-NODL-NEXT:    s_bfe_i32 s4, s2, 0x80010
-; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NODL-NEXT:    s_bfe_i32 s5, s2, 0x80010
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s1, s0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-NODL-NEXT:    s_bfe_i32 s4, s0, 0x80008
+; GFX9-NODL-NEXT:    s_bfe_i32 s3, s0, 0x80010
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NODL-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s3, v5, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s1, v4, v2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s4, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
 ; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -282,15 +282,15 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -356,28 +356,28 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX7-NEXT:    s_movk_i32 s5, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s4, s8
 ; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
-; GFX7-NEXT:    s_and_b32 s6, s5, s8
-; GFX7-NEXT:    s_bfe_u32 s8, s5, 0x80008
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80010
-; GFX7-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
+; GFX7-NEXT:    s_and_b32 s7, s6, s5
+; GFX7-NEXT:    s_and_b32 s5, s4, s5
+; GFX7-NEXT:    s_bfe_u32 s8, s6, 0x80008
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_bfe_u32 s10, s6, 0x80010
+; GFX7-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7-NEXT:    s_lshr_b32 s6, s6, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
 ; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -386,31 +386,31 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x80008
-; GFX8-NEXT:    s_and_b32 s3, s1, s2
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
+; GFX8-NEXT:    s_and_b32 s3, s1, s0
+; GFX8-NEXT:    s_and_b32 s0, s2, s0
 ; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX8-NEXT:    s_and_b32 s2, s0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x80010
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s6
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 24
+; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -418,31 +418,31 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s0, 0x80008
-; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s2
+; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
+; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
+; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
 ; GFX9-NODL-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT:    s_and_b32 s2, s0, s2
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NODL-NEXT:    s_bfe_u32 s6, s1, 0x80010
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s0, 0x80010
 ; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-NODL-NEXT:    s_lshr_b32 s0, s0, 24
+; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -451,15 +451,15 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -520,23 +520,23 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_sext_i32_i8 s7, s4
-; GFX7-NEXT:    s_sext_i32_i8 s8, s5
-; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x80008
-; GFX7-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x80008
-; GFX7-NEXT:    v_mad_i32_i24 v1, s7, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s10
-; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x80010
-; GFX7-NEXT:    v_mad_i32_i24 v1, s9, v2, v1
-; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x80010
-; GFX7-NEXT:    v_mad_i32_i24 v0, s7, v0, v1
+; GFX7-NEXT:    s_sext_i32_i8 s6, s4
+; GFX7-NEXT:    s_sext_i32_i8 s7, s5
+; GFX7-NEXT:    s_bfe_i32 s9, s5, 0x80008
+; GFX7-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s12
+; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80008
+; GFX7-NEXT:    v_mad_i32_i24 v1, s6, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80010
+; GFX7-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
+; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x80010
+; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX7-NEXT:    s_ashr_i32 s5, s5, 24
-; GFX7-NEXT:    v_mad_i32_i24 v0, s11, v1, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
 ; GFX7-NEXT:    s_ashr_i32 s4, s4, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v1, v0
@@ -550,28 +550,28 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sext_i32_i8 s0, s2
-; GFX8-NEXT:    s_sext_i32_i8 s1, s3
-; GFX8-NEXT:    s_bfe_i32 s6, s3, 0x80008
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x80008
-; GFX8-NEXT:    v_mad_i32_i24 v3, s0, v2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:    s_bfe_i32 s8, s3, 0x80010
-; GFX8-NEXT:    v_mad_i32_i24 v3, s5, v4, v3
-; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x80010
-; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v3, s8
+; GFX8-NEXT:    s_sext_i32_i8 s4, s2
+; GFX8-NEXT:    s_sext_i32_i8 s5, s3
+; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x80008
+; GFX8-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NEXT:    v_mov_b32_e32 v1, s10
+; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x80008
+; GFX8-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    s_bfe_i32 s9, s3, 0x80010
+; GFX8-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x80010
+; GFX8-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, 24
-; GFX8-NEXT:    v_mad_i32_i24 v2, s7, v3, v2
+; GFX8-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -582,28 +582,28 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s0, s2
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s1, s3
-; GFX9-NODL-NEXT:    s_bfe_i32 s6, s3, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NODL-NEXT:    s_bfe_i32 s5, s2, 0x80008
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, s0, v2, v3
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-NODL-NEXT:    s_bfe_i32 s8, s3, 0x80010
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, s5, v4, v3
-; GFX9-NODL-NEXT:    s_bfe_i32 s7, s2, 0x80010
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s0, v2, v3
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
+; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT:    s_bfe_i32 s6, s2, 0x80008
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NODL-NEXT:    s_bfe_i32 s9, s3, 0x80010
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    s_bfe_i32 s8, s2, 0x80010
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 24
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s7, v3, v2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -614,28 +614,28 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_sext_i32_i8 s0, s2
-; GFX9-DL-NEXT:    s_sext_i32_i8 s1, s3
-; GFX9-DL-NEXT:    s_bfe_i32 s6, s3, 0x80008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    s_bfe_i32 s5, s2, 0x80008
-; GFX9-DL-NEXT:    v_mad_i32_i24 v3, s0, v2, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-DL-NEXT:    s_bfe_i32 s8, s3, 0x80010
-; GFX9-DL-NEXT:    v_mad_i32_i24 v3, s5, v4, v3
-; GFX9-DL-NEXT:    s_bfe_i32 s7, s2, 0x80010
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v2, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-DL-NEXT:    s_sext_i32_i8 s4, s2
+; GFX9-DL-NEXT:    s_sext_i32_i8 s5, s3
+; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x80008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-DL-NEXT:    s_bfe_i32 s6, s2, 0x80008
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT:    s_bfe_i32 s9, s3, 0x80010
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x80010
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 24
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s7, v3, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -648,23 +648,23 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_sext_i32_i8 s0, s2
-; GFX10-DL-NEXT:    s_sext_i32_i8 s1, s3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_sext_i32_i8 s5, s2
+; GFX10-DL-NEXT:    s_sext_i32_i8 s6, s3
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-DL-NEXT:    s_bfe_i32 s4, s2, 0x80008
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s3, 0x80008
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s3, 0x80008
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s5, s6, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s4, s7, v0
 ; GFX10-DL-NEXT:    s_bfe_i32 s4, s2, 0x80010
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s3, 0x80010
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_ashr_i32 s0, s2, 24
-; GFX10-DL-NEXT:    s_ashr_i32 s1, s3, 24
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s3, 0x80010
+; GFX10-DL-NEXT:    s_ashr_i32 s2, s2, 24
+; GFX10-DL-NEXT:    s_ashr_i32 s3, s3, 24
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s5, s6, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s4, s7, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i8> addrspace(1)* %src2,
@@ -719,25 +719,25 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_ashr_i32 s7, s4, 24
-; GFX7-NEXT:    s_ashr_i32 s10, s5, 24
-; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80010
-; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x80008
+; GFX7-NEXT:    s_ashr_i32 s6, s4, 24
+; GFX7-NEXT:    s_ashr_i32 s9, s5, 24
+; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x80010
+; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80008
 ; GFX7-NEXT:    s_sext_i32_i8 s5, s5
-; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80010
-; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x80008
+; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x80010
+; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80008
 ; GFX7-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s12
-; GFX7-NEXT:    v_mad_i32_i24 v0, s9, v1, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s7, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -748,28 +748,28 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 8, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 8, s3
-; GFX8-NEXT:    s_ashr_i32 s5, s3, 24
-; GFX8-NEXT:    s_bfe_i32 s6, s3, 0x80010
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
+; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
+; GFX8-NEXT:    s_ashr_i32 s6, s3, 24
+; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x80010
 ; GFX8-NEXT:    s_sext_i32_i8 s3, s3
-; GFX8-NEXT:    s_ashr_i32 s0, s2, 24
-; GFX8-NEXT:    s_bfe_i32 s1, s2, 0x80010
+; GFX8-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x80010
 ; GFX8-NEXT:    s_sext_i32_i8 s2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v4, s3
-; GFX8-NEXT:    v_mov_b32_e32 v5, s4
-; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX8-NEXT:    v_mad_i32_i24 v4, s2, v4, v5
-; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v3, v4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s6
-; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_mov_b32_e32 v3, s8
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v2, v3
+; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -780,28 +780,28 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    s_load_dword s8, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v2, 8, s2
-; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v3, 8, s3
-; GFX9-NODL-NEXT:    s_ashr_i32 s5, s3, 24
-; GFX9-NODL-NEXT:    s_bfe_i32 s6, s3, 0x80010
+; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
+; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
+; GFX9-NODL-NEXT:    s_ashr_i32 s6, s3, 24
+; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80010
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s3, s3
-; GFX9-NODL-NEXT:    s_ashr_i32 s0, s2, 24
-; GFX9-NODL-NEXT:    s_bfe_i32 s1, s2, 0x80010
+; GFX9-NODL-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX9-NODL-NEXT:    s_bfe_i32 s5, s2, 0x80010
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s2, s2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX9-NODL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v4, s2, v4, v5
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v2, v3, v4
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NODL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s2, v2, v3
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, v0, v1, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s4, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -812,28 +812,28 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    s_load_dword s8, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s2
-; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s3
-; GFX9-DL-NEXT:    s_ashr_i32 s5, s3, 24
-; GFX9-DL-NEXT:    s_bfe_i32 s6, s3, 0x80010
+; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
+; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
+; GFX9-DL-NEXT:    s_ashr_i32 s6, s3, 24
+; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x80010
 ; GFX9-DL-NEXT:    s_sext_i32_i8 s3, s3
-; GFX9-DL-NEXT:    s_ashr_i32 s0, s2, 24
-; GFX9-DL-NEXT:    s_bfe_i32 s1, s2, 0x80010
+; GFX9-DL-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX9-DL-NEXT:    s_bfe_i32 s5, s2, 0x80010
 ; GFX9-DL-NEXT:    s_sext_i32_i8 s2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX9-DL-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX9-DL-NEXT:    v_mad_i32_i24 v4, s2, v4, v5
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, v2, v3, v4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s2, v2, v3
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, v0, v1, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -846,24 +846,24 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s2
-; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX10-DL-NEXT:    s_sext_i32_i8 s0, s2
-; GFX10-DL-NEXT:    s_sext_i32_i8 s1, s3
-; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX10-DL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
+; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_sext_i32_i8 s5, s2
+; GFX10-DL-NEXT:    s_sext_i32_i8 s6, s3
+; GFX10-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX10-DL-NEXT:    s_bfe_i32 s4, s2, 0x80010
+; GFX10-DL-NEXT:    s_ashr_i32 s2, s2, 24
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s5, s6, v2
 ; GFX10-DL-NEXT:    s_bfe_i32 s5, s3, 0x80010
-; GFX10-DL-NEXT:    v_mad_i32_i24 v4, s0, s1, v4
-; GFX10-DL-NEXT:    s_ashr_i32 s0, s2, 24
-; GFX10-DL-NEXT:    s_ashr_i32 s1, s3, 24
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, v2, v3, v4
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_ashr_i32 s3, s3, 24
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, v1, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s4, s5, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s3, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
@@ -927,33 +927,33 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x80000
-; GFX8-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX8-NEXT:    s_bfe_i32 s5, s1, 0x80000
-; GFX8-NEXT:    v_ashrrev_i16_e64 v4, 8, s1
-; GFX8-NEXT:    s_bfe_i32 s1, s3, 0x80000
-; GFX8-NEXT:    v_ashrrev_i16_e64 v6, 8, s3
-; GFX8-NEXT:    s_and_b32 s3, s0, s6
-; GFX8-NEXT:    v_ashrrev_i16_e64 v3, 8, s2
-; GFX8-NEXT:    s_bfe_i32 s2, s4, 0x80000
+; GFX8-NEXT:    s_bfe_i32 s6, s3, 0x80000
+; GFX8-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX8-NEXT:    v_ashrrev_i16_e64 v3, 8, s3
+; GFX8-NEXT:    s_bfe_i32 s3, s4, 0x80000
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX8-NEXT:    s_bfe_i32 s5, s0, 0x80000
+; GFX8-NEXT:    v_ashrrev_i16_e64 v4, 8, s0
+; GFX8-NEXT:    s_bfe_i32 s0, s1, 0x80000
+; GFX8-NEXT:    v_ashrrev_i16_e64 v6, 8, s1
+; GFX8-NEXT:    s_and_b32 s1, s2, s6
 ; GFX8-NEXT:    v_ashrrev_i16_e64 v5, 8, s4
-; GFX8-NEXT:    s_and_b32 s4, s0, s5
-; GFX8-NEXT:    v_mov_b32_e32 v7, s3
-; GFX8-NEXT:    s_and_b32 s2, s0, s2
-; GFX8-NEXT:    s_and_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s4, s2, s5
+; GFX8-NEXT:    v_mov_b32_e32 v7, s1
+; GFX8-NEXT:    s_and_b32 s3, s2, s3
+; GFX8-NEXT:    s_and_b32 s0, s2, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v7, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v4, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v6, v5, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
@@ -1046,31 +1046,31 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ushort v3, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s0, 0x80000
+; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x80000
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 8, s0
+; GFX10-DL-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 8, s1
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, s3, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, s4, v2
 ; GFX10-DL-NEXT:    s_bfe_i32 s0, s2, 0x80000
-; GFX10-DL-NEXT:    s_bfe_i32 s1, s3, 0x80000
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 8, s2
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, s0, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v6, s1, v2
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 8, s3
-; GFX10-DL-NEXT:    s_bfe_i32 s0, s4, 0x80000
 ; GFX10-DL-NEXT:    s_bfe_i32 s1, s5, 0x80000
-; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v7
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v8, 8, s4
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 8, s2
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 8, s5
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, s1, v2
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v7
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 8, s5
+; GFX10-DL-NEXT:    v_and_b32_e32 v8, s1, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v6, 16, v7
-; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
+; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v7, 16, v8
+; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v11, 16, v2
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v5
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v4, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index fddb14dbf975..6172d54b8e0d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -17,22 +17,22 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s4, s8
-; GFX7-NEXT:    s_and_b32 s8, s5, s8
-; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80008
-; GFX7-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    s_bfe_u32 s12, s5, 0x80010
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
-; GFX7-NEXT:    v_mov_b32_e32 v1, s10
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX7-NEXT:    s_and_b32 s6, s4, s8
+; GFX7-NEXT:    s_and_b32 s7, s5, s8
+; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
+; GFX7-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s12
+; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
+; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
+; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
-; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
@@ -47,22 +47,22 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s6, s3, s2
+; GFX8-NEXT:    s_and_b32 s5, s3, s2
 ; GFX8-NEXT:    s_and_b32 s2, s4, s2
-; GFX8-NEXT:    s_bfe_u32 s8, s4, 0x80008
+; GFX8-NEXT:    s_bfe_u32 s7, s4, 0x80008
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    s_bfe_u32 s10, s4, 0x80010
-; GFX8-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
-; GFX8-NEXT:    s_bfe_u32 s7, s3, 0x80008
-; GFX8-NEXT:    v_mov_b32_e32 v1, s8
-; GFX8-NEXT:    s_bfe_u32 s9, s3, 0x80010
-; GFX8-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s10
+; GFX8-NEXT:    s_bfe_u32 s9, s4, 0x80010
+; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
+; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX8-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX8-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX8-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
@@ -79,22 +79,22 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
+; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
-; GFX9-NODL-NEXT:    s_bfe_u32 s8, s4, 0x80008
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NODL-NEXT:    s_bfe_u32 s10, s4, 0x80010
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-NODL-NEXT:    s_bfe_u32 s9, s3, 0x80010
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
@@ -108,15 +108,15 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v2, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s4, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -126,14 +126,14 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s3, s4, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
@@ -183,28 +183,28 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX7-NEXT:    s_movk_i32 s5, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s4, s8
 ; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
-; GFX7-NEXT:    s_and_b32 s6, s5, s8
-; GFX7-NEXT:    s_bfe_u32 s8, s5, 0x80008
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80010
-; GFX7-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
+; GFX7-NEXT:    s_and_b32 s7, s6, s5
+; GFX7-NEXT:    s_and_b32 s5, s4, s5
+; GFX7-NEXT:    s_bfe_u32 s8, s6, 0x80008
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_bfe_u32 s10, s6, 0x80010
+; GFX7-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7-NEXT:    s_lshr_b32 s6, s6, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -214,20 +214,20 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
+; GFX8-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX8-NEXT:    s_and_b32 s3, s1, s0
 ; GFX8-NEXT:    s_and_b32 s0, s2, s0
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
 ; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX8-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s7
@@ -246,20 +246,20 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX9-NODL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
 ; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
 ; GFX9-NODL-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9-NODL-NEXT:    s_bfe_u32 s6, s1, 0x80010
 ; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s7
@@ -278,15 +278,15 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -353,28 +353,28 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX7-NEXT:    s_movk_i32 s5, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s4, s8
 ; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
-; GFX7-NEXT:    s_and_b32 s6, s5, s8
-; GFX7-NEXT:    s_bfe_u32 s8, s5, 0x80008
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80010
-; GFX7-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
+; GFX7-NEXT:    s_and_b32 s7, s6, s5
+; GFX7-NEXT:    s_and_b32 s5, s4, s5
+; GFX7-NEXT:    s_bfe_u32 s8, s6, 0x80008
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_bfe_u32 s10, s6, 0x80010
+; GFX7-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7-NEXT:    s_lshr_b32 s6, s6, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
 ; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -383,31 +383,31 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x80008
-; GFX8-NEXT:    s_and_b32 s3, s1, s2
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
+; GFX8-NEXT:    s_and_b32 s3, s1, s0
+; GFX8-NEXT:    s_and_b32 s0, s2, s0
 ; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX8-NEXT:    s_and_b32 s2, s0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x80010
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s6
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 24
+; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -415,31 +415,31 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL:       ; %bb.0: ; %entry
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s0, 0x80008
-; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s2
+; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
+; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
+; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
 ; GFX9-NODL-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT:    s_and_b32 s2, s0, s2
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NODL-NEXT:    s_bfe_u32 s6, s1, 0x80010
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s0, 0x80010
 ; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-NODL-NEXT:    s_lshr_b32 s0, s0, 24
+; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -448,15 +448,15 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -613,19 +613,19 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s3, s2
-; GFX10-DL-NEXT:    s_and_b32 s1, s4, s2
-; GFX10-DL-NEXT:    s_bfe_u32 s2, s4, 0x80008
-; GFX10-DL-NEXT:    s_bfe_u32 s3, s3, 0x80008
+; GFX10-DL-NEXT:    s_and_b32 s3, s0, s2
+; GFX10-DL-NEXT:    s_and_b32 s2, s1, s2
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s0, 0x80008
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s1, 0x80008
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                    <4 x i8> addrspace(1)* %src2,
@@ -752,14 +752,14 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s3, v3, v2
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -948,24 +948,24 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s3, 0x80008
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x80008
-; GFX10-DL-NEXT:    s_and_b32 s5, s3, s2
-; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s3, 0x80010
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s4, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x80008
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x80008
+; GFX10-DL-NEXT:    s_and_b32 s5, s0, s2
+; GFX10-DL-NEXT:    s_and_b32 s2, s1, s2
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 24
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s4, 24
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s3, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x80010
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s5, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s6, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s3, v2
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
@@ -1012,23 +1012,23 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s4, s8
-; GFX7-NEXT:    s_and_b32 s8, s5, s8
-; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80008
-; GFX7-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
-; GFX7-NEXT:    v_mad_u32_u24 v1, s7, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s10
-; GFX7-NEXT:    s_bfe_u32 s12, s5, 0x80010
-; GFX7-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
+; GFX7-NEXT:    s_and_b32 s6, s4, s8
+; GFX7-NEXT:    s_and_b32 s7, s5, s8
+; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
+; GFX7-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s12
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
+; GFX7-NEXT:    v_mad_u32_u24 v1, s6, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
+; GFX7-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
+; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
-; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
@@ -1043,23 +1043,23 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s6, s3, s2
+; GFX8-NEXT:    s_and_b32 s5, s3, s2
 ; GFX8-NEXT:    s_and_b32 s2, s4, s2
-; GFX8-NEXT:    s_bfe_u32 s8, s4, 0x80008
+; GFX8-NEXT:    s_bfe_u32 s7, s4, 0x80008
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    s_bfe_u32 s7, s3, 0x80008
-; GFX8-NEXT:    v_mad_u32_u24 v1, s6, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s8
-; GFX8-NEXT:    s_bfe_u32 s10, s4, 0x80010
-; GFX8-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
-; GFX8-NEXT:    s_bfe_u32 s9, s3, 0x80010
-; GFX8-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s10
+; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX8-NEXT:    v_mad_u32_u24 v1, s5, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    s_bfe_u32 s9, s4, 0x80010
+; GFX8-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
+; GFX8-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX8-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX8-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
@@ -1076,23 +1076,23 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
+; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
-; GFX9-NODL-NEXT:    s_bfe_u32 s8, s4, 0x80008
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80008
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v0, v1
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NODL-NEXT:    s_bfe_u32 s10, s4, 0x80010
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
-; GFX9-NODL-NEXT:    s_bfe_u32 s9, s3, 0x80010
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v0, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
+; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
@@ -1109,23 +1109,23 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s6, s3, s2
+; GFX9-DL-NEXT:    s_and_b32 s5, s3, s2
 ; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s4, 0x80008
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x80008
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s3, 0x80008
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v0, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s4, 0x80010
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v2, v1
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s3, 0x80010
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v0, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s4, 0x80010
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
 ; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
@@ -1144,23 +1144,23 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s6, s3, s2
+; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s3, 0x80008
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s4, 0x80008
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s2, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s5, s7, v0
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s3, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s4, 0x80010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s2, v0
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s3, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 24
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s5, s7, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s3, s2
-; GFX10-DL-NEXT:    s_and_b32 s1, s4, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-DL-NEXT:    s_bfe_u32 s2, s3, 0x80008
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s4, 0x80008
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s5, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s2, s3, 0x80010
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s4, 0x80010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 24
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s4, 24
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s5, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i8> addrspace(1)* %src2,
@@ -1215,23 +1215,23 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s4, s8
-; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80008
-; GFX7-NEXT:    s_and_b32 s8, s5, s8
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
-; GFX7-NEXT:    v_mov_b32_e32 v0, s10
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v0, v1
-; GFX7-NEXT:    s_bfe_u32 s12, s5, 0x80010
-; GFX7-NEXT:    v_mov_b32_e32 v2, s8
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s12
+; GFX7-NEXT:    s_and_b32 s6, s4, s8
+; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
+; GFX7-NEXT:    s_and_b32 s7, s5, s8
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
+; GFX7-NEXT:    v_mov_b32_e32 v0, s9
+; GFX7-NEXT:    v_mov_b32_e32 v1, s12
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v0, v1
+; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s12, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
-; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v2, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v2, v0
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v2, v0
@@ -1247,23 +1247,23 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s6, s3, s2
-; GFX8-NEXT:    s_bfe_u32 s8, s4, 0x80008
+; GFX8-NEXT:    s_and_b32 s5, s3, s2
+; GFX8-NEXT:    s_bfe_u32 s7, s4, 0x80008
 ; GFX8-NEXT:    s_and_b32 s2, s4, s2
-; GFX8-NEXT:    s_bfe_u32 s7, s3, 0x80008
-; GFX8-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
-; GFX8-NEXT:    s_bfe_u32 s10, s4, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    v_mov_b32_e32 v1, s10
+; GFX8-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
+; GFX8-NEXT:    s_bfe_u32 s9, s4, 0x80010
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    s_bfe_u32 s9, s3, 0x80010
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s5, v0
-; GFX8-NEXT:    v_mad_u32_u24 v0, s6, v2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s10, v0
+; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX8-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX8-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
+; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v2, v0
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mad_u32_u24 v0, s3, v2, v0
@@ -1281,23 +1281,23 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_and_b32 s6, s3, s2
-; GFX9-NODL-NEXT:    s_bfe_u32 s8, s4, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
-; GFX9-NODL-NEXT:    s_bfe_u32 s10, s4, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
+; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NODL-NEXT:    s_bfe_u32 s9, s3, 0x80010
-; GFX9-NODL-NEXT:    v_add_u32_e32 v1, s5, v0
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s6, v2, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX9-NODL-NEXT:    v_add_u32_e32 v1, s10, v0
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s5, v2, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s8, v2, v0
 ; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s3, v2, v0
@@ -1315,23 +1315,23 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s10, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s6, s3, s2
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s4, 0x80008
+; GFX9-DL-NEXT:    s_and_b32 s5, s3, s2
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x80008
 ; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s3, 0x80008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s4, 0x80010
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s4, 0x80010
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s3, 0x80010
-; GFX9-DL-NEXT:    v_add_u32_e32 v1, s5, v0
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s6, v2, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX9-DL-NEXT:    v_add_u32_e32 v1, s10, v0
+; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s5, v2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
+; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s8, v2, v0
 ; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s3, v2, v0
@@ -1351,24 +1351,24 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s3, 0x80008
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x80008
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-DL-NEXT:    s_and_b32 s6, s3, s2
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s4, 0x80008
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-DL-NEXT:    s_and_b32 s8, s3, s2
 ; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s3, 0x80010
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x80010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s6, s2, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s7, v0
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s3, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s4, 0x80010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s8, s2, v0
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s3, 24
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 24
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s5, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s0, s1, v3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s2, s3, v3
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, s5, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s6, s7, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i8> addrspace(1)* %src2,
@@ -1421,28 +1421,28 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_sext_i32_i8 s6, s4
-; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80008
-; GFX7-NEXT:    s_sext_i32_i8 s7, s5
-; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
+; GFX7-NEXT:    s_sext_i32_i8 s7, s6
+; GFX7-NEXT:    s_bfe_u32 s9, s6, 0x80008
+; GFX7-NEXT:    s_sext_i32_i8 s5, s4
 ; GFX7-NEXT:    s_and_b32 s7, s7, s8
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s9
-; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
-; GFX7-NEXT:    s_and_b32 s6, s6, s8
+; GFX7-NEXT:    s_bfe_u32 s11, s6, 0x80010
+; GFX7-NEXT:    s_and_b32 s5, s5, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x80010
-; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
+; GFX7-NEXT:    s_lshr_b32 s6, s6, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v3, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v3, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -1452,29 +1452,29 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s0, s2, 0x80008
-; GFX8-NEXT:    s_bfe_u32 s1, s3, 0x80008
+; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX8-NEXT:    s_sext_i32_i8 s3, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_sext_i32_i8 s4, s3
-; GFX8-NEXT:    s_bfe_u32 s5, s3, 0x80010
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_sext_i32_i8 s1, s2
-; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x80010
-; GFX8-NEXT:    s_lshr_b32 s3, s3, 24
-; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s1, s0, 0x80008
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
+; GFX8-NEXT:    s_sext_i32_i8 s4, s0
+; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v4, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v5, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1483,29 +1483,29 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_bfe_u32 s0, s2, 0x80008
-; GFX9-NODL-NEXT:    s_bfe_u32 s1, s3, 0x80008
+; GFX9-NODL-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s3, s2
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s3
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s3, 0x80010
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s1, s2
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s2, 0x80010
-; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s1, s0, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s0
+; GFX9-NODL-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NODL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v5, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s1, v4, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -1514,29 +1514,29 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_u32 s0, s2, 0x80008
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s3, 0x80008
+; GFX9-DL-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX9-DL-NEXT:    s_sext_i32_i8 s3, s2
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_sext_i32_i8 s4, s3
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s3, 0x80010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT:    s_sext_i32_i8 s1, s2
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x80010
-; GFX9-DL-NEXT:    s_lshr_b32 s3, s3, 24
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x80010
+; GFX9-DL-NEXT:    s_bfe_u32 s1, s0, 0x80008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-DL-NEXT:    s_sext_i32_i8 s4, s0
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x80010
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v5, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v5, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1546,24 +1546,24 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x80008
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s3, 0x80008
-; GFX10-DL-NEXT:    s_sext_i32_i8 s4, s2
-; GFX10-DL-NEXT:    s_sext_i32_i8 s5, s3
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s2, 0x80010
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s3, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80008
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80008
+; GFX10-DL-NEXT:    s_sext_i32_i8 s4, s0
+; GFX10-DL-NEXT:    s_sext_i32_i8 s5, s1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s2, 24
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s3, 24
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80010
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s6, s7, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
@@ -1613,31 +1613,32 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_movk_i32 s12, 0xff
+; GFX7-NEXT:    s_movk_i32 s11, 0xff
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s7, s4, 24
-; GFX7-NEXT:    s_lshr_b32 s9, s5, 24
-; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80008
-; GFX7-NEXT:    s_bfe_u32 s13, s5, 0x80010
-; GFX7-NEXT:    s_and_b32 s5, s5, s12
-; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT:    s_and_b32 s4, s4, s12
+; GFX7-NEXT:    s_lshr_b32 s6, s4, 24
+; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
+; GFX7-NEXT:    s_lshr_b32 s8, s5, 24
+; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s12, s5, 0x80010
+; GFX7-NEXT:    s_and_b32 s5, s5, s11
+; GFX7-NEXT:    s_and_b32 s4, s4, s11
+; GFX7-NEXT:    s_load_dword s11, s[0:1], 0x0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s10
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s13
-; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s12
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -1649,24 +1650,25 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s6, s3, 24
-; GFX8-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX8-NEXT:    s_lshr_b32 s5, s3, 24
+; GFX8-NEXT:    s_lshr_b32 s6, s4, 24
+; GFX8-NEXT:    s_bfe_u32 s7, s3, 0x80010
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 8, s3
 ; GFX8-NEXT:    s_and_b32 s3, s3, s2
 ; GFX8-NEXT:    s_and_b32 s2, s4, s2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    s_bfe_u32 s9, s4, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s8, s4, 0x80010
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 8, s4
+; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v2, v3
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s9
-; GFX8-NEXT:    s_lshr_b32 s7, s4, 24
-; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX8-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -1680,24 +1682,25 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s5, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_lshr_b32 s6, s3, 24
-; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX9-NODL-NEXT:    s_lshr_b32 s5, s3, 24
+; GFX9-NODL-NEXT:    s_lshr_b32 s6, s4, 24
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80010
 ; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v0, 8, s3
 ; GFX9-NODL-NEXT:    s_and_b32 s3, s3, s2
 ; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s8, s4, 0x80010
 ; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v1, 8, s4
+; GFX9-NODL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v2, v3
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NODL-NEXT:    s_lshr_b32 s7, s4, 24
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s6, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s8
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v1, v0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
@@ -1711,24 +1714,25 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_lshr_b32 s6, s3, 24
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX9-DL-NEXT:    s_lshr_b32 s5, s3, 24
+; GFX9-DL-NEXT:    s_lshr_b32 s6, s4, 24
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s3, 0x80010
 ; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v0, 8, s3
 ; GFX9-DL-NEXT:    s_and_b32 s3, s3, s2
 ; GFX9-DL-NEXT:    s_and_b32 s2, s4, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s4, 0x80010
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s4, 0x80010
 ; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s4
+; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-DL-NEXT:    s_lshr_b32 s7, s4, 24
-; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s8
+; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v1, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
@@ -1745,24 +1749,24 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT:    s_and_b32 s7, s4, s3
+; GFX10-DL-NEXT:    s_and_b32 s3, s5, s3
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-DL-NEXT:    v_and_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-DL-NEXT:    v_and_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s4, 0x80010
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 24
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s3, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s5, 0x80010
+; GFX10-DL-NEXT:    s_lshr_b32 s5, s5, 24
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-DL-NEXT:    s_and_b32 s0, s4, s3
-; GFX10-DL-NEXT:    s_and_b32 s1, s5, s3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v4, s6
-; GFX10-DL-NEXT:    v_and_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-DL-NEXT:    v_and_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-DL-NEXT:    s_bfe_u32 s3, s4, 0x80010
-; GFX10-DL-NEXT:    s_bfe_u32 s2, s5, 0x80010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v4, s0, s1, v4
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s5, 24
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, v2, v3, v4
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
@@ -1798,29 +1802,29 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_movk_i32 s7, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s6, s4, 24
-; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x80008
-; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80008
-; GFX7-NEXT:    s_bfe_u32 s12, s5, 0x80010
-; GFX7-NEXT:    s_lshr_b32 s9, s5, 24
-; GFX7-NEXT:    s_and_b32 s5, s5, s8
+; GFX7-NEXT:    s_bfe_u32 s10, s6, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s12, s6, 0x80010
+; GFX7-NEXT:    s_lshr_b32 s9, s6, 24
+; GFX7-NEXT:    s_and_b32 s6, s6, s7
+; GFX7-NEXT:    s_lshr_b32 s5, s4, 24
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
 ; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT:    s_and_b32 s4, s4, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_and_b32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s12
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v2, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s9
-; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -1934,27 +1938,27 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ushort v3, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s2
-; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v5, 8, s3
-; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s3, 16
+; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v5, 8, s1
+; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v7
-; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
-; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 24
-; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT:    v_lshl_or_b32 v5, s3, 16, v6
-; GFX10-DL-NEXT:    v_lshl_or_b32 v2, s2, 16, v2
+; GFX10-DL-NEXT:    v_lshl_or_b32 v5, s1, 16, v6
+; GFX10-DL-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v5
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v4, v3
@@ -2081,25 +2085,25 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s3, 16
-; GFX9-NODL-NEXT:    s_lshr_b32 s4, s3, 24
-; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v3, s2, v3
-; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v4, s2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX9-NODL-NEXT:    s_lshr_b32 s1, s2, 16
+; GFX9-NODL-NEXT:    s_lshr_b32 s3, s2, 24
+; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v3, s0, v3
+; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX9-NODL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NODL-NEXT:    s_lshr_b32 s5, s2, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v5, s0, v5
+; GFX9-NODL-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v5, s2, v5
 ; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX9-NODL-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NODL-NEXT:    v_or_b32_e32 v4, v3, v4
@@ -2117,25 +2121,25 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-DL-NEXT:    s_lshr_b32 s1, s3, 16
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s3, 24
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, s2, v3
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, s2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX9-DL-NEXT:    s_lshr_b32 s1, s2, 16
+; GFX9-DL-NEXT:    s_lshr_b32 s3, s2, 24
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, s0, v3
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-DL-NEXT:    s_lshr_b32 s5, s2, 24
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, s0, v5
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, s2, v5
 ; GFX9-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_or_b32_e32 v4, v3, v4
@@ -2154,24 +2158,24 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s2
-; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s3
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s2, 24
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s3, 24
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s2, s3
+; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s0
+; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s1
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, v3, v4
 ; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s0, s1
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 16
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s2, s3
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s1, 16
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v3
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 8, v4
-; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 8, v5
 ; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s4, s0
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index e41f5fe6b6a3..9ce54565b109 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -16,38 +16,38 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_i32 s8, s0, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s9, s1, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s11, s1, 0x40004
-; GFX7-NEXT:    v_mov_b32_e32 v0, s9
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v0, v1
-; GFX7-NEXT:    s_bfe_i32 s10, s0, 0x40004
-; GFX7-NEXT:    v_mov_b32_e32 v1, s11
-; GFX7-NEXT:    s_bfe_i32 s13, s1, 0x40008
-; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
-; GFX7-NEXT:    s_bfe_i32 s12, s0, 0x40008
-; GFX7-NEXT:    v_mov_b32_e32 v1, s13
-; GFX7-NEXT:    s_bfe_i32 s15, s1, 0x4000c
-; GFX7-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
-; GFX7-NEXT:    s_bfe_i32 s14, s0, 0x4000c
-; GFX7-NEXT:    v_mov_b32_e32 v1, s15
-; GFX7-NEXT:    s_bfe_i32 s17, s1, 0x40010
-; GFX7-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
-; GFX7-NEXT:    s_bfe_i32 s16, s0, 0x40010
-; GFX7-NEXT:    v_mov_b32_e32 v1, s17
-; GFX7-NEXT:    s_bfe_i32 s19, s1, 0x40014
-; GFX7-NEXT:    s_bfe_i32 s21, s1, 0x40018
-; GFX7-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
-; GFX7-NEXT:    s_bfe_i32 s18, s0, 0x40014
-; GFX7-NEXT:    v_mov_b32_e32 v1, s19
-; GFX7-NEXT:    s_bfe_i32 s20, s0, 0x40018
-; GFX7-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
+; GFX7-NEXT:    s_bfe_i32 s2, s0, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s8, s1, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40004
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s21
+; GFX7-NEXT:    v_mad_i32_i24 v0, s2, v0, v1
+; GFX7-NEXT:    s_bfe_i32 s9, s0, 0x40004
+; GFX7-NEXT:    v_mov_b32_e32 v1, s10
+; GFX7-NEXT:    s_bfe_i32 s12, s1, 0x40008
+; GFX7-NEXT:    v_mad_i32_i24 v0, s9, v1, v0
+; GFX7-NEXT:    s_bfe_i32 s11, s0, 0x40008
+; GFX7-NEXT:    v_mov_b32_e32 v1, s12
+; GFX7-NEXT:    s_bfe_i32 s14, s1, 0x4000c
+; GFX7-NEXT:    v_mad_i32_i24 v0, s11, v1, v0
+; GFX7-NEXT:    s_bfe_i32 s13, s0, 0x4000c
+; GFX7-NEXT:    v_mov_b32_e32 v1, s14
+; GFX7-NEXT:    s_bfe_i32 s16, s1, 0x40010
+; GFX7-NEXT:    v_mad_i32_i24 v0, s13, v1, v0
+; GFX7-NEXT:    s_bfe_i32 s15, s0, 0x40010
+; GFX7-NEXT:    v_mov_b32_e32 v1, s16
+; GFX7-NEXT:    s_bfe_i32 s18, s1, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s20, s1, 0x40018
+; GFX7-NEXT:    v_mad_i32_i24 v0, s15, v1, v0
+; GFX7-NEXT:    s_bfe_i32 s17, s0, 0x40014
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    s_bfe_i32 s19, s0, 0x40018
+; GFX7-NEXT:    v_mad_i32_i24 v0, s17, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s20
 ; GFX7-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX7-NEXT:    v_mad_i32_i24 v0, s20, v1, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s19, v1, v0
 ; GFX7-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s0, v1, v0
@@ -61,43 +61,43 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_i32 s0, s2, 0x40000
-; GFX8-NEXT:    s_bfe_i32 s1, s4, 0x40000
-; GFX8-NEXT:    s_bfe_i32 s7, s4, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v2, v3
-; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    s_bfe_i32 s9, s4, 0x40008
-; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
-; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v3, s9
-; GFX8-NEXT:    s_bfe_i32 s11, s4, 0x4000c
-; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v3, v2
-; GFX8-NEXT:    s_bfe_i32 s10, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v3, s11
-; GFX8-NEXT:    s_bfe_i32 s13, s4, 0x40010
-; GFX8-NEXT:    v_mad_i32_i24 v2, s10, v3, v2
-; GFX8-NEXT:    s_bfe_i32 s12, s2, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v3, s13
-; GFX8-NEXT:    s_bfe_i32 s15, s4, 0x40014
-; GFX8-NEXT:    s_bfe_i32 s17, s4, 0x40018
-; GFX8-NEXT:    v_mad_i32_i24 v2, s12, v3, v2
-; GFX8-NEXT:    s_bfe_i32 s14, s2, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v3, s15
-; GFX8-NEXT:    s_bfe_i32 s16, s2, 0x40018
-; GFX8-NEXT:    v_mad_i32_i24 v2, s14, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s17
+; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s6, s4, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s8, s4, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s19
+; GFX8-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
+; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX8-NEXT:    s_bfe_i32 s10, s4, 0x40008
+; GFX8-NEXT:    v_mad_i32_i24 v0, s7, v1, v0
+; GFX8-NEXT:    s_bfe_i32 s9, s2, 0x40008
+; GFX8-NEXT:    v_mov_b32_e32 v1, s10
+; GFX8-NEXT:    s_bfe_i32 s12, s4, 0x4000c
+; GFX8-NEXT:    v_mad_i32_i24 v0, s9, v1, v0
+; GFX8-NEXT:    s_bfe_i32 s11, s2, 0x4000c
+; GFX8-NEXT:    v_mov_b32_e32 v1, s12
+; GFX8-NEXT:    s_bfe_i32 s14, s4, 0x40010
+; GFX8-NEXT:    v_mad_i32_i24 v0, s11, v1, v0
+; GFX8-NEXT:    s_bfe_i32 s13, s2, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v1, s14
+; GFX8-NEXT:    s_bfe_i32 s16, s4, 0x40014
+; GFX8-NEXT:    s_bfe_i32 s18, s4, 0x40018
+; GFX8-NEXT:    v_mad_i32_i24 v0, s13, v1, v0
+; GFX8-NEXT:    s_bfe_i32 s15, s2, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v1, s16
+; GFX8-NEXT:    s_bfe_i32 s17, s2, 0x40018
+; GFX8-NEXT:    v_mad_i32_i24 v0, s15, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX8-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX8-NEXT:    v_mad_i32_i24 v2, s16, v3, v2
+; GFX8-NEXT:    v_mad_i32_i24 v0, s17, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -108,43 +108,43 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x40000
-; GFX9-NEXT:    s_bfe_i32 s1, s4, 0x40000
-; GFX9-NEXT:    s_bfe_i32 s7, s4, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    v_mad_i32_i24 v2, s0, v2, v3
-; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    s_bfe_i32 s9, s4, 0x40008
-; GFX9-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
-; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    s_bfe_i32 s11, s4, 0x4000c
-; GFX9-NEXT:    v_mad_i32_i24 v2, s8, v3, v2
-; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    s_bfe_i32 s13, s4, 0x40010
-; GFX9-NEXT:    v_mad_i32_i24 v2, s10, v3, v2
-; GFX9-NEXT:    s_bfe_i32 s12, s2, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v3, s13
-; GFX9-NEXT:    s_bfe_i32 s15, s4, 0x40014
-; GFX9-NEXT:    s_bfe_i32 s17, s4, 0x40018
-; GFX9-NEXT:    v_mad_i32_i24 v2, s12, v3, v2
-; GFX9-NEXT:    s_bfe_i32 s14, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    s_bfe_i32 s16, s2, 0x40018
-; GFX9-NEXT:    v_mad_i32_i24 v2, s14, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-NEXT:    s_bfe_i32 s5, s2, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s6, s4, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s8, s4, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
+; GFX9-NEXT:    s_bfe_i32 s7, s2, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
+; GFX9-NEXT:    s_bfe_i32 s10, s4, 0x40008
+; GFX9-NEXT:    v_mad_i32_i24 v0, s7, v1, v0
+; GFX9-NEXT:    s_bfe_i32 s9, s2, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NEXT:    s_bfe_i32 s12, s4, 0x4000c
+; GFX9-NEXT:    v_mad_i32_i24 v0, s9, v1, v0
+; GFX9-NEXT:    s_bfe_i32 s11, s2, 0x4000c
+; GFX9-NEXT:    v_mov_b32_e32 v1, s12
+; GFX9-NEXT:    s_bfe_i32 s14, s4, 0x40010
+; GFX9-NEXT:    v_mad_i32_i24 v0, s11, v1, v0
+; GFX9-NEXT:    s_bfe_i32 s13, s2, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v1, s14
+; GFX9-NEXT:    s_bfe_i32 s16, s4, 0x40014
+; GFX9-NEXT:    s_bfe_i32 s18, s4, 0x40018
+; GFX9-NEXT:    v_mad_i32_i24 v0, s13, v1, v0
+; GFX9-NEXT:    s_bfe_i32 s15, s2, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v1, s16
+; GFX9-NEXT:    s_bfe_i32 s17, s2, 0x40018
+; GFX9-NEXT:    v_mad_i32_i24 v0, s15, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX9-NEXT:    v_mad_i32_i24 v2, s16, v3, v2
+; GFX9-NEXT:    v_mad_i32_i24 v0, s17, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -153,15 +153,15 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-DL-NEXT:    v_dot8_i32_i4 v2, s2, v2, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-DL-NEXT:    v_dot8_i32_i4 v2, s4, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -171,14 +171,14 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_dot8_i32_i4 v2, s4, s5, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-DL-NEXT:    v_dot8_i32_i4 v2, s2, s4, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
@@ -321,49 +321,49 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_i32 s0, s2, 0x40000
-; GFX8-NEXT:    s_bfe_i32 s1, s4, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s1, s2, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s4, s2, 0x40004
+; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x40008
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_i32 s5, s4, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s6, s4, 0x40008
-; GFX8-NEXT:    s_lshr_b32 s1, s2, 12
-; GFX8-NEXT:    s_lshr_b32 s7, s4, 12
-; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s9, s2, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:    v_mov_b32_e32 v7, s5
-; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s1
-; GFX8-NEXT:    v_lshlrev_b16_e64 v6, 12, s7
+; GFX8-NEXT:    s_bfe_i32 s6, s0, 0x40000
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 12
+; GFX8-NEXT:    s_lshr_b32 s7, s2, 12
+; GFX8-NEXT:    v_mov_b32_e32 v4, s5
+; GFX8-NEXT:    s_bfe_i32 s9, s0, 0x40008
+; GFX8-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NEXT:    s_bfe_i32 s8, s0, 0x40004
+; GFX8-NEXT:    v_lshlrev_b16_e64 v6, 12, s1
+; GFX8-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
 ; GFX8-NEXT:    v_mul_i32_i24_e32 v4, s9, v4
-; GFX8-NEXT:    s_bfe_i32 s10, s4, 0x40010
-; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
+; GFX8-NEXT:    s_bfe_i32 s1, s2, 0x40010
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
-; GFX8-NEXT:    s_bfe_i32 s12, s4, 0x40014
-; GFX8-NEXT:    s_bfe_i32 s11, s2, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v8, s10
-; GFX8-NEXT:    s_bfe_i32 s14, s4, 0x40018
-; GFX8-NEXT:    s_bfe_i32 s13, s2, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v9, s12
-; GFX8-NEXT:    s_bfe_i32 s15, s2, 0x40018
-; GFX8-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX8-NEXT:    v_mov_b32_e32 v10, s14
+; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
+; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v8, s1
+; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x40010
+; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x40018
+; GFX8-NEXT:    v_mov_b32_e32 v9, s5
+; GFX8-NEXT:    s_bfe_i32 s1, s0, 0x40014
+; GFX8-NEXT:    s_bfe_i32 s5, s0, 0x40018
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 28
+; GFX8-NEXT:    v_mov_b32_e32 v10, s7
+; GFX8-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v7, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v5, v2
 ; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX8-NEXT:    v_mad_u32_u24 v2, v5, v6, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, v6, v7, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v8, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v9, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s5, v10, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -372,49 +372,49 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x40000
-; GFX9-NEXT:    s_bfe_i32 s1, s4, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s1, s2, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s4, s2, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s5, s2, 0x40008
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_bfe_i32 s5, s4, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s6, s4, 0x40008
-; GFX9-NEXT:    s_lshr_b32 s1, s2, 12
-; GFX9-NEXT:    s_lshr_b32 s7, s4, 12
-; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s9, s2, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-NEXT:    v_mov_b32_e32 v7, s5
-; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s1
-; GFX9-NEXT:    v_lshlrev_b16_e64 v6, 12, s7
+; GFX9-NEXT:    s_bfe_i32 s6, s0, 0x40000
+; GFX9-NEXT:    s_lshr_b32 s1, s0, 12
+; GFX9-NEXT:    s_lshr_b32 s7, s2, 12
+; GFX9-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-NEXT:    s_bfe_i32 s9, s0, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9-NEXT:    s_bfe_i32 s8, s0, 0x40004
+; GFX9-NEXT:    v_lshlrev_b16_e64 v6, 12, s1
+; GFX9-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v4, s9, v4
-; GFX9-NEXT:    s_bfe_i32 s10, s4, 0x40010
-; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-NEXT:    s_bfe_i32 s1, s2, 0x40010
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-NEXT:    s_bfe_i32 s12, s4, 0x40014
-; GFX9-NEXT:    s_bfe_i32 s11, s2, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v8, s10
-; GFX9-NEXT:    s_bfe_i32 s14, s4, 0x40018
-; GFX9-NEXT:    s_bfe_i32 s13, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v9, s12
-; GFX9-NEXT:    s_bfe_i32 s15, s2, 0x40018
-; GFX9-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX9-NEXT:    v_mov_b32_e32 v10, s14
+; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-NEXT:    s_bfe_i32 s5, s2, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v8, s1
+; GFX9-NEXT:    s_bfe_i32 s4, s0, 0x40010
+; GFX9-NEXT:    s_bfe_i32 s7, s2, 0x40018
+; GFX9-NEXT:    v_mov_b32_e32 v9, s5
+; GFX9-NEXT:    s_bfe_i32 s1, s0, 0x40014
+; GFX9-NEXT:    s_bfe_i32 s5, s0, 0x40018
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
+; GFX9-NEXT:    v_mov_b32_e32 v10, s7
+; GFX9-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s8, v7, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s8, v5, v2
 ; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NEXT:    v_mad_u32_u24 v2, v5, v6, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, v6, v7, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s4, v8, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s1, v9, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s5, v10, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -423,49 +423,49 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_i32 s0, s2, 0x40000
-; GFX9-DL-NEXT:    s_bfe_i32 s1, s4, 0x40000
+; GFX9-DL-NEXT:    s_bfe_i32 s1, s2, 0x40000
+; GFX9-DL-NEXT:    s_bfe_i32 s4, s2, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s5, s2, 0x40008
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_bfe_i32 s5, s4, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s6, s4, 0x40008
-; GFX9-DL-NEXT:    s_lshr_b32 s1, s2, 12
-; GFX9-DL-NEXT:    s_lshr_b32 s7, s4, 12
-; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s9, s2, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s5
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s1
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s7
+; GFX9-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
+; GFX9-DL-NEXT:    s_lshr_b32 s1, s0, 12
+; GFX9-DL-NEXT:    s_lshr_b32 s7, s2, 12
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s1
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
 ; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v4, s9, v4
-; GFX9-DL-NEXT:    s_bfe_i32 s10, s4, 0x40010
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-DL-NEXT:    s_bfe_i32 s1, s2, 0x40010
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-DL-NEXT:    s_bfe_i32 s12, s4, 0x40014
-; GFX9-DL-NEXT:    s_bfe_i32 s11, s2, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s10
-; GFX9-DL-NEXT:    s_bfe_i32 s14, s4, 0x40018
-; GFX9-DL-NEXT:    s_bfe_i32 s13, s2, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s12
-; GFX9-DL-NEXT:    s_bfe_i32 s15, s2, 0x40018
-; GFX9-DL-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s14
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-DL-NEXT:    s_bfe_i32 s5, s2, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s1
+; GFX9-DL-NEXT:    s_bfe_i32 s4, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_i32 s7, s2, 0x40018
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s5
+; GFX9-DL-NEXT:    s_bfe_i32 s1, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_i32 s5, s0, 0x40018
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s7
+; GFX9-DL-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s8, v7, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s8, v5, v2
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v5, v6, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v6, v7, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v8, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v9, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s5, v10, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -476,44 +476,44 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s4, 12
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s5, 12
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s4, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s5, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s8, s4, 0x40004
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s0
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
-; GFX10-DL-NEXT:    s_bfe_i32 s9, s5, 0x40004
-; GFX10-DL-NEXT:    s_bfe_i32 s10, s4, 0x40008
-; GFX10-DL-NEXT:    s_bfe_i32 s11, s5, 0x40008
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 12
+; GFX10-DL-NEXT:    s_lshr_b32 s5, s1, 12
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s4
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s5
+; GFX10-DL-NEXT:    s_bfe_i32 s9, s1, 0x40004
+; GFX10-DL-NEXT:    s_bfe_i32 s10, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s11, s1, 0x40008
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
-; GFX10-DL-NEXT:    s_bfe_i32 s0, s4, 0x40010
-; GFX10-DL-NEXT:    s_bfe_i32 s1, s5, 0x40010
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s1, 0x40010
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s0, 0x40014
 ; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v5, s10, s11
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
 ; GFX10-DL-NEXT:    v_and_b32_e32 v4, s2, v4
-; GFX10-DL-NEXT:    s_bfe_i32 s10, s4, 0x40014
-; GFX10-DL-NEXT:    s_bfe_i32 s11, s5, 0x40014
-; GFX10-DL-NEXT:    s_bfe_i32 s12, s4, 0x40018
-; GFX10-DL-NEXT:    s_bfe_i32 s2, s5, 0x40018
-; GFX10-DL-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX10-DL-NEXT:    s_ashr_i32 s5, s5, 28
+; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s6, s7, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s1, 0x40014
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s8, s9, v2
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, v3, v4, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
+; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s4, v2
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s10, s11, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s12, s2, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
@@ -656,21 +656,21 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s4, s0, 12
-; GFX8-NEXT:    s_bfe_i32 s7, s1, 0x40000
-; GFX8-NEXT:    s_lshr_b32 s5, s1, 12
-; GFX8-NEXT:    s_bfe_i32 s9, s1, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s11, s1, 0x40008
-; GFX8-NEXT:    s_bfe_i32 s6, s0, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s7, s6, 0x40000
+; GFX8-NEXT:    s_lshr_b32 s4, s6, 12
+; GFX8-NEXT:    s_bfe_i32 s9, s6, 0x40004
+; GFX8-NEXT:    s_bfe_i32 s11, s6, 0x40008
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 12
+; GFX8-NEXT:    s_bfe_i32 s5, s0, 0x40000
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s7
-; GFX8-NEXT:    v_lshlrev_b16_e64 v4, 12, s4
-; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s5
+; GFX8-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
+; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
 ; GFX8-NEXT:    s_bfe_i32 s8, s0, 0x40004
 ; GFX8-NEXT:    s_bfe_i32 s10, s0, 0x40008
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s11
@@ -678,28 +678,28 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s10, v3
-; GFX8-NEXT:    s_bfe_i32 s13, s1, 0x40010
+; GFX8-NEXT:    s_bfe_i32 s13, s6, 0x40010
 ; GFX8-NEXT:    v_and_b32_e32 v4, s2, v4
 ; GFX8-NEXT:    v_and_b32_e32 v5, s2, v5
-; GFX8-NEXT:    s_bfe_i32 s15, s1, 0x40014
+; GFX8-NEXT:    s_bfe_i32 s15, s6, 0x40014
 ; GFX8-NEXT:    s_bfe_i32 s12, s0, 0x40010
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s13
-; GFX8-NEXT:    s_bfe_i32 s17, s1, 0x40018
+; GFX8-NEXT:    s_bfe_i32 s17, s6, 0x40018
 ; GFX8-NEXT:    s_bfe_i32 s14, s0, 0x40014
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s15
 ; GFX8-NEXT:    s_bfe_i32 s16, s0, 0x40018
-; GFX8-NEXT:    s_ashr_i32 s1, s1, 28
+; GFX8-NEXT:    s_ashr_i32 s6, s6, 28
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s17
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v6, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v7, v2
 ; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s12, v8, v2
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s14, v9, v2
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s16, v10, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -710,21 +710,21 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s4, s0, 12
-; GFX9-NEXT:    s_bfe_i32 s7, s1, 0x40000
-; GFX9-NEXT:    s_lshr_b32 s5, s1, 12
-; GFX9-NEXT:    s_bfe_i32 s9, s1, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s11, s1, 0x40008
-; GFX9-NEXT:    s_bfe_i32 s6, s0, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s7, s6, 0x40000
+; GFX9-NEXT:    s_lshr_b32 s4, s6, 12
+; GFX9-NEXT:    s_bfe_i32 s9, s6, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s11, s6, 0x40008
+; GFX9-NEXT:    s_lshr_b32 s1, s0, 12
+; GFX9-NEXT:    s_bfe_i32 s5, s0, 0x40000
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s4
-; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s5
+; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
+; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
 ; GFX9-NEXT:    s_bfe_i32 s8, s0, 0x40004
 ; GFX9-NEXT:    s_bfe_i32 s10, s0, 0x40008
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s11
@@ -732,28 +732,28 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v3, s10, v3
-; GFX9-NEXT:    s_bfe_i32 s13, s1, 0x40010
+; GFX9-NEXT:    s_bfe_i32 s13, s6, 0x40010
 ; GFX9-NEXT:    v_and_b32_e32 v4, s2, v4
 ; GFX9-NEXT:    v_and_b32_e32 v5, s2, v5
-; GFX9-NEXT:    s_bfe_i32 s15, s1, 0x40014
+; GFX9-NEXT:    s_bfe_i32 s15, s6, 0x40014
 ; GFX9-NEXT:    s_bfe_i32 s12, s0, 0x40010
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s13
-; GFX9-NEXT:    s_bfe_i32 s17, s1, 0x40018
+; GFX9-NEXT:    s_bfe_i32 s17, s6, 0x40018
 ; GFX9-NEXT:    s_bfe_i32 s14, s0, 0x40014
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s15
 ; GFX9-NEXT:    s_bfe_i32 s16, s0, 0x40018
-; GFX9-NEXT:    s_ashr_i32 s1, s1, 28
+; GFX9-NEXT:    s_ashr_i32 s6, s6, 28
 ; GFX9-NEXT:    v_mov_b32_e32 v10, s17
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_i32_i24 v2, s6, v6, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s8, v7, v2
 ; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s12, v8, v2
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s14, v9, v2
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s16, v10, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -764,21 +764,21 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s0, 12
-; GFX9-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
-; GFX9-DL-NEXT:    s_lshr_b32 s5, s1, 12
-; GFX9-DL-NEXT:    s_bfe_i32 s9, s1, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s11, s1, 0x40008
-; GFX9-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
+; GFX9-DL-NEXT:    s_bfe_i32 s7, s6, 0x40000
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s6, 12
+; GFX9-DL-NEXT:    s_bfe_i32 s9, s6, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s11, s6, 0x40008
+; GFX9-DL-NEXT:    s_lshr_b32 s1, s0, 12
+; GFX9-DL-NEXT:    s_bfe_i32 s5, s0, 0x40000
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s4
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s5
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
 ; GFX9-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
 ; GFX9-DL-NEXT:    s_bfe_i32 s10, s0, 0x40008
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s11
@@ -786,28 +786,28 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v3, s10, v3
-; GFX9-DL-NEXT:    s_bfe_i32 s13, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_i32 s13, s6, 0x40010
 ; GFX9-DL-NEXT:    v_and_b32_e32 v4, s2, v4
 ; GFX9-DL-NEXT:    v_and_b32_e32 v5, s2, v5
-; GFX9-DL-NEXT:    s_bfe_i32 s15, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_i32 s15, s6, 0x40014
 ; GFX9-DL-NEXT:    s_bfe_i32 s12, s0, 0x40010
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s13
-; GFX9-DL-NEXT:    s_bfe_i32 s17, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_i32 s17, s6, 0x40018
 ; GFX9-DL-NEXT:    s_bfe_i32 s14, s0, 0x40014
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s15
 ; GFX9-DL-NEXT:    s_bfe_i32 s16, s0, 0x40018
-; GFX9-DL-NEXT:    s_ashr_i32 s1, s1, 28
+; GFX9-DL-NEXT:    s_ashr_i32 s6, s6, 28
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s17
 ; GFX9-DL-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v6, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s8, v7, v2
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s12, v8, v2
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s14, v9, v2
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s16, v10, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
@@ -819,44 +819,44 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s4, 12
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s5, 12
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s4, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s5, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s8, s4, 0x40004
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s0
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
-; GFX10-DL-NEXT:    s_bfe_i32 s9, s5, 0x40004
-; GFX10-DL-NEXT:    s_bfe_i32 s10, s4, 0x40008
-; GFX10-DL-NEXT:    s_bfe_i32 s11, s5, 0x40008
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 12
+; GFX10-DL-NEXT:    s_lshr_b32 s5, s1, 12
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s4
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s5
+; GFX10-DL-NEXT:    s_bfe_i32 s9, s1, 0x40004
+; GFX10-DL-NEXT:    s_bfe_i32 s10, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s11, s1, 0x40008
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
-; GFX10-DL-NEXT:    s_bfe_i32 s0, s4, 0x40010
-; GFX10-DL-NEXT:    s_bfe_i32 s1, s5, 0x40010
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s1, 0x40010
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s0, 0x40014
 ; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v5, s10, s11
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
 ; GFX10-DL-NEXT:    v_and_b32_e32 v4, s2, v4
-; GFX10-DL-NEXT:    s_bfe_i32 s10, s4, 0x40014
-; GFX10-DL-NEXT:    s_bfe_i32 s11, s5, 0x40014
-; GFX10-DL-NEXT:    s_bfe_i32 s12, s4, 0x40018
-; GFX10-DL-NEXT:    s_bfe_i32 s2, s5, 0x40018
-; GFX10-DL-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX10-DL-NEXT:    s_ashr_i32 s5, s5, 28
+; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s6, s7, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s1, 0x40014
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s8, s9, v2
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, v3, v4, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
+; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s4, v2
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s10, s11, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s12, s2, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
@@ -939,39 +939,39 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_i32 s8, s0, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s9, s1, 0x40000
-; GFX7-NEXT:    v_mov_b32_e32 v0, s9
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mad_i32_i24 v1, s8, v0, v1
-; GFX7-NEXT:    s_bfe_i32 s11, s1, 0x40004
-; GFX7-NEXT:    s_bfe_i32 s10, s0, 0x40004
-; GFX7-NEXT:    s_bfe_i32 s13, s1, 0x40008
-; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s11
-; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v2, v0
-; GFX7-NEXT:    s_bfe_i32 s12, s0, 0x40008
-; GFX7-NEXT:    v_mov_b32_e32 v2, s13
-; GFX7-NEXT:    s_bfe_i32 s15, s1, 0x4000c
-; GFX7-NEXT:    v_mad_i32_i24 v0, s12, v2, v0
-; GFX7-NEXT:    s_bfe_i32 s14, s0, 0x4000c
-; GFX7-NEXT:    v_mov_b32_e32 v2, s15
-; GFX7-NEXT:    s_bfe_i32 s17, s1, 0x40010
-; GFX7-NEXT:    v_mad_i32_i24 v0, s14, v2, v0
-; GFX7-NEXT:    s_bfe_i32 s16, s0, 0x40010
-; GFX7-NEXT:    v_mov_b32_e32 v2, s17
-; GFX7-NEXT:    s_bfe_i32 s19, s1, 0x40014
-; GFX7-NEXT:    s_bfe_i32 s21, s1, 0x40018
-; GFX7-NEXT:    v_mad_i32_i24 v0, s16, v2, v0
-; GFX7-NEXT:    s_bfe_i32 s18, s0, 0x40014
-; GFX7-NEXT:    v_mov_b32_e32 v2, s19
-; GFX7-NEXT:    s_bfe_i32 s20, s0, 0x40018
-; GFX7-NEXT:    v_mad_i32_i24 v0, s18, v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s21
+; GFX7-NEXT:    s_bfe_i32 s2, s0, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s8, s1, 0x40000
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    v_mov_b32_e32 v1, s21
+; GFX7-NEXT:    v_mad_i32_i24 v1, s2, v0, v1
+; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s9, s0, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s12, s1, 0x40008
+; GFX7-NEXT:    v_mad_i32_i24 v0, s2, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s10
+; GFX7-NEXT:    v_mad_i32_i24 v0, s9, v2, v0
+; GFX7-NEXT:    s_bfe_i32 s11, s0, 0x40008
+; GFX7-NEXT:    v_mov_b32_e32 v2, s12
+; GFX7-NEXT:    s_bfe_i32 s14, s1, 0x4000c
+; GFX7-NEXT:    v_mad_i32_i24 v0, s11, v2, v0
+; GFX7-NEXT:    s_bfe_i32 s13, s0, 0x4000c
+; GFX7-NEXT:    v_mov_b32_e32 v2, s14
+; GFX7-NEXT:    s_bfe_i32 s16, s1, 0x40010
+; GFX7-NEXT:    v_mad_i32_i24 v0, s13, v2, v0
+; GFX7-NEXT:    s_bfe_i32 s15, s0, 0x40010
+; GFX7-NEXT:    v_mov_b32_e32 v2, s16
+; GFX7-NEXT:    s_bfe_i32 s18, s1, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s20, s1, 0x40018
+; GFX7-NEXT:    v_mad_i32_i24 v0, s15, v2, v0
+; GFX7-NEXT:    s_bfe_i32 s17, s0, 0x40014
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    s_bfe_i32 s19, s0, 0x40018
+; GFX7-NEXT:    v_mad_i32_i24 v0, s17, v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s20
 ; GFX7-NEXT:    s_ashr_i32 s1, s1, 28
-; GFX7-NEXT:    v_mad_i32_i24 v0, s20, v2, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s19, v2, v0
 ; GFX7-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s0, v2, v0
@@ -986,45 +986,45 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_i32 s0, s2, 0x40000
-; GFX8-NEXT:    s_bfe_i32 s1, s4, 0x40000
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_mad_i32_i24 v3, s0, v2, v3
-; GFX8-NEXT:    s_bfe_i32 s7, s4, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s9, s4, 0x40008
-; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s7
-; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v4, v2
-; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v4, s9
-; GFX8-NEXT:    s_bfe_i32 s11, s4, 0x4000c
-; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v4, v2
-; GFX8-NEXT:    s_bfe_i32 s10, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v4, s11
-; GFX8-NEXT:    s_bfe_i32 s13, s4, 0x40010
-; GFX8-NEXT:    v_mad_i32_i24 v2, s10, v4, v2
-; GFX8-NEXT:    s_bfe_i32 s12, s2, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v4, s13
-; GFX8-NEXT:    s_bfe_i32 s15, s4, 0x40014
-; GFX8-NEXT:    s_bfe_i32 s17, s4, 0x40018
-; GFX8-NEXT:    v_mad_i32_i24 v2, s12, v4, v2
-; GFX8-NEXT:    s_bfe_i32 s14, s2, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v4, s15
-; GFX8-NEXT:    s_bfe_i32 s16, s2, 0x40018
-; GFX8-NEXT:    v_mad_i32_i24 v2, s14, v4, v2
-; GFX8-NEXT:    v_mov_b32_e32 v4, s17
+; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s6, s4, 0x40000
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s19
+; GFX8-NEXT:    v_mad_i32_i24 v1, s5, v0, v1
+; GFX8-NEXT:    s_bfe_i32 s8, s4, 0x40004
+; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x40004
+; GFX8-NEXT:    s_bfe_i32 s10, s4, 0x40008
+; GFX8-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX8-NEXT:    v_mad_i32_i24 v0, s7, v2, v0
+; GFX8-NEXT:    s_bfe_i32 s9, s2, 0x40008
+; GFX8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NEXT:    s_bfe_i32 s12, s4, 0x4000c
+; GFX8-NEXT:    v_mad_i32_i24 v0, s9, v2, v0
+; GFX8-NEXT:    s_bfe_i32 s11, s2, 0x4000c
+; GFX8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX8-NEXT:    s_bfe_i32 s14, s4, 0x40010
+; GFX8-NEXT:    v_mad_i32_i24 v0, s11, v2, v0
+; GFX8-NEXT:    s_bfe_i32 s13, s2, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v2, s14
+; GFX8-NEXT:    s_bfe_i32 s16, s4, 0x40014
+; GFX8-NEXT:    s_bfe_i32 s18, s4, 0x40018
+; GFX8-NEXT:    v_mad_i32_i24 v0, s13, v2, v0
+; GFX8-NEXT:    s_bfe_i32 s15, s2, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v2, s16
+; GFX8-NEXT:    s_bfe_i32 s17, s2, 0x40018
+; GFX8-NEXT:    v_mad_i32_i24 v0, s15, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX8-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX8-NEXT:    v_mad_i32_i24 v2, s16, v4, v2
+; GFX8-NEXT:    v_mad_i32_i24 v0, s17, v2, v0
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v4, v2
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mad_i32_i24 v0, s2, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1035,45 +1035,45 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x40000
-; GFX9-NEXT:    s_bfe_i32 s1, s4, 0x40000
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    v_mad_i32_i24 v3, s0, v2, v3
-; GFX9-NEXT:    s_bfe_i32 s7, s4, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s9, s4, 0x40008
-; GFX9-NEXT:    v_mad_i32_i24 v2, s0, v2, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s7
-; GFX9-NEXT:    v_mad_i32_i24 v2, s6, v4, v2
-; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v4, s9
-; GFX9-NEXT:    s_bfe_i32 s11, s4, 0x4000c
-; GFX9-NEXT:    v_mad_i32_i24 v2, s8, v4, v2
-; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-NEXT:    s_bfe_i32 s13, s4, 0x40010
-; GFX9-NEXT:    v_mad_i32_i24 v2, s10, v4, v2
-; GFX9-NEXT:    s_bfe_i32 s12, s2, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v4, s13
-; GFX9-NEXT:    s_bfe_i32 s15, s4, 0x40014
-; GFX9-NEXT:    s_bfe_i32 s17, s4, 0x40018
-; GFX9-NEXT:    v_mad_i32_i24 v2, s12, v4, v2
-; GFX9-NEXT:    s_bfe_i32 s14, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-NEXT:    s_bfe_i32 s16, s2, 0x40018
-; GFX9-NEXT:    v_mad_i32_i24 v2, s14, v4, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, s17
+; GFX9-NEXT:    s_bfe_i32 s5, s2, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s6, s4, 0x40000
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-NEXT:    v_mad_i32_i24 v1, s5, v0, v1
+; GFX9-NEXT:    s_bfe_i32 s8, s4, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s7, s2, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s10, s4, 0x40008
+; GFX9-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-NEXT:    v_mad_i32_i24 v0, s7, v2, v0
+; GFX9-NEXT:    s_bfe_i32 s9, s2, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    s_bfe_i32 s12, s4, 0x4000c
+; GFX9-NEXT:    v_mad_i32_i24 v0, s9, v2, v0
+; GFX9-NEXT:    s_bfe_i32 s11, s2, 0x4000c
+; GFX9-NEXT:    v_mov_b32_e32 v2, s12
+; GFX9-NEXT:    s_bfe_i32 s14, s4, 0x40010
+; GFX9-NEXT:    v_mad_i32_i24 v0, s11, v2, v0
+; GFX9-NEXT:    s_bfe_i32 s13, s2, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    s_bfe_i32 s16, s4, 0x40014
+; GFX9-NEXT:    s_bfe_i32 s18, s4, 0x40018
+; GFX9-NEXT:    v_mad_i32_i24 v0, s13, v2, v0
+; GFX9-NEXT:    s_bfe_i32 s15, s2, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v2, s16
+; GFX9-NEXT:    s_bfe_i32 s17, s2, 0x40018
+; GFX9-NEXT:    v_mad_i32_i24 v0, s15, v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX9-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX9-NEXT:    v_mad_i32_i24 v2, s16, v4, v2
+; GFX9-NEXT:    v_mad_i32_i24 v0, s17, v2, v0
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NEXT:    v_mad_i32_i24 v2, s2, v4, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mad_i32_i24 v0, s2, v2, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1084,45 +1084,45 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_i32 s0, s2, 0x40000
-; GFX9-DL-NEXT:    s_bfe_i32 s1, s4, 0x40000
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-DL-NEXT:    v_mad_i32_i24 v3, s0, v2, v3
-; GFX9-DL-NEXT:    s_bfe_i32 s7, s4, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s6, s2, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s9, s4, 0x40008
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v2, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s7
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v4, v2
-; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s9
-; GFX9-DL-NEXT:    s_bfe_i32 s11, s4, 0x4000c
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s8, v4, v2
-; GFX9-DL-NEXT:    s_bfe_i32 s10, s2, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-DL-NEXT:    s_bfe_i32 s13, s4, 0x40010
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s10, v4, v2
-; GFX9-DL-NEXT:    s_bfe_i32 s12, s2, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s13
-; GFX9-DL-NEXT:    s_bfe_i32 s15, s4, 0x40014
-; GFX9-DL-NEXT:    s_bfe_i32 s17, s4, 0x40018
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s12, v4, v2
-; GFX9-DL-NEXT:    s_bfe_i32 s14, s2, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-DL-NEXT:    s_bfe_i32 s16, s2, 0x40018
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s14, v4, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s17
+; GFX9-DL-NEXT:    s_bfe_i32 s5, s2, 0x40000
+; GFX9-DL-NEXT:    s_bfe_i32 s6, s4, 0x40000
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s5, v0, v1
+; GFX9-DL-NEXT:    s_bfe_i32 s8, s4, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s7, s2, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s10, s4, 0x40008
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s7, v2, v0
+; GFX9-DL-NEXT:    s_bfe_i32 s9, s2, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-DL-NEXT:    s_bfe_i32 s12, s4, 0x4000c
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s9, v2, v0
+; GFX9-DL-NEXT:    s_bfe_i32 s11, s2, 0x4000c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s12
+; GFX9-DL-NEXT:    s_bfe_i32 s14, s4, 0x40010
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s11, v2, v0
+; GFX9-DL-NEXT:    s_bfe_i32 s13, s2, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-DL-NEXT:    s_bfe_i32 s16, s4, 0x40014
+; GFX9-DL-NEXT:    s_bfe_i32 s18, s4, 0x40018
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s13, v2, v0
+; GFX9-DL-NEXT:    s_bfe_i32 s15, s2, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s16
+; GFX9-DL-NEXT:    s_bfe_i32 s17, s2, 0x40018
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s15, v2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX9-DL-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s16, v4, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s17, v2, v0
 ; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s2, v4, v2
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s2, v2, v0
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1135,36 +1135,36 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_bfe_i32 s0, s2, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s1, s4, 0x40000
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s2, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s4, 0x40000
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-DL-NEXT:    s_bfe_i32 s5, s2, 0x40004
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s4, 0x40004
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s2, 0x40008
-; GFX10-DL-NEXT:    s_bfe_i32 s8, s4, 0x40008
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_bfe_i32 s9, s2, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_i32 s10, s4, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_i32 s11, s2, 0x40010
-; GFX10-DL-NEXT:    s_bfe_i32 s12, s4, 0x40010
-; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s0, s1, v2
-; GFX10-DL-NEXT:    s_bfe_i32 s0, s2, 0x40014
-; GFX10-DL-NEXT:    s_bfe_i32 s1, s4, 0x40014
-; GFX10-DL-NEXT:    s_bfe_i32 s13, s2, 0x40018
-; GFX10-DL-NEXT:    s_bfe_i32 s14, s4, 0x40018
-; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s5, s6, v3
-; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s7, s8, v3
-; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s9, s10, v3
-; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s11, s12, v3
-; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s0, s1, v3
-; GFX10-DL-NEXT:    s_ashr_i32 s0, s2, 28
-; GFX10-DL-NEXT:    s_ashr_i32 s1, s4, 28
-; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s13, s14, v3
-; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s0, s1, v3
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s4, 0x40004
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s7, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v0
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s4, 0x40008
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s5, s8, v1
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s2, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s4, 0x4000c
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s4, 0x40010
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s5, s8, v1
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s4, 0x40014
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s2, 0x40018
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s4, 0x40018
+; GFX10-DL-NEXT:    s_ashr_i32 s2, s2, 28
+; GFX10-DL-NEXT:    s_ashr_i32 s4, s4, 28
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s5, s8, v1
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s4, v1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v0, v1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                 <8 x i4> addrspace(1)* %src2,
@@ -1248,17 +1248,15 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s1, s[8:9], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[10:11], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_ashr_i64 s[10:11], s[0:1], 60
 ; GFX7-NEXT:    s_lshl_b32 s11, s1, 4
-; GFX7-NEXT:    s_ashr_i64 s[14:15], s[10:11], 60
-; GFX7-NEXT:    s_lshl_b32 s11, s1, 12
 ; GFX7-NEXT:    s_ashr_i64 s[16:17], s[10:11], 60
 ; GFX7-NEXT:    s_lshl_b32 s11, s1, 16
 ; GFX7-NEXT:    s_ashr_i64 s[18:19], s[10:11], 60
 ; GFX7-NEXT:    s_lshl_b32 s11, s1, 20
 ; GFX7-NEXT:    s_lshl_b32 s13, s1, 8
+; GFX7-NEXT:    s_lshl_b32 s15, s1, 12
 ; GFX7-NEXT:    s_ashr_i64 s[20:21], s[10:11], 60
 ; GFX7-NEXT:    s_lshl_b32 s11, s1, 24
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 28
@@ -1278,10 +1276,14 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_lshl_b32 s1, s9, 28
 ; GFX7-NEXT:    s_ashr_i64 s[24:25], s[8:9], 60
 ; GFX7-NEXT:    s_ashr_i64 s[8:9], s[0:1], 60
+; GFX7-NEXT:    s_load_dword s1, s[4:5], 0x0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mad_i32_i24 v0, s0, v0, v1
 ; GFX7-NEXT:    s_ashr_i64 s[22:23], s[10:11], 60
+; GFX7-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX7-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mad_i32_i24 v0, s0, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s36
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s22, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s34
@@ -1289,12 +1291,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s32
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s30
-; GFX7-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
-; GFX7-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX7-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s28
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s26
-; GFX7-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s24
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -1308,56 +1309,56 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dword s5, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s7, s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_ashr_i64 s[0:1], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s1, s5, 4
-; GFX8-NEXT:    s_ashr_i64 s[12:13], s[0:1], 60
-; GFX8-NEXT:    s_lshl_b32 s1, s5, 16
-; GFX8-NEXT:    s_ashr_i64 s[14:15], s[0:1], 60
-; GFX8-NEXT:    s_lshl_b32 s1, s5, 20
-; GFX8-NEXT:    s_ashr_i64 s[16:17], s[0:1], 60
-; GFX8-NEXT:    s_lshl_b32 s1, s5, 24
-; GFX8-NEXT:    s_ashr_i64 s[18:19], s[0:1], 60
-; GFX8-NEXT:    s_lshl_b32 s1, s5, 28
-; GFX8-NEXT:    s_lshl_b32 s9, s5, 8
-; GFX8-NEXT:    s_lshl_b32 s11, s5, 12
-; GFX8-NEXT:    s_ashr_i64 s[4:5], s[0:1], 60
-; GFX8-NEXT:    s_lshl_b32 s1, s7, 4
-; GFX8-NEXT:    s_ashr_i64 s[22:23], s[0:1], 60
-; GFX8-NEXT:    s_lshl_b32 s1, s7, 8
-; GFX8-NEXT:    s_ashr_i64 s[24:25], s[0:1], 60
-; GFX8-NEXT:    s_lshl_b32 s1, s7, 12
-; GFX8-NEXT:    s_ashr_i64 s[26:27], s[0:1], 60
-; GFX8-NEXT:    s_lshl_b32 s1, s7, 16
-; GFX8-NEXT:    s_ashr_i64 s[28:29], s[0:1], 60
-; GFX8-NEXT:    s_lshl_b32 s1, s7, 20
-; GFX8-NEXT:    s_ashr_i64 s[30:31], s[0:1], 60
-; GFX8-NEXT:    s_lshl_b32 s1, s7, 24
-; GFX8-NEXT:    s_ashr_i64 s[32:33], s[0:1], 60
-; GFX8-NEXT:    s_lshl_b32 s1, s7, 28
-; GFX8-NEXT:    s_ashr_i64 s[20:21], s[6:7], 60
-; GFX8-NEXT:    s_ashr_i64 s[6:7], s[0:1], 60
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v3, s32
-; GFX8-NEXT:    v_mad_i32_i24 v2, s18, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s30
-; GFX8-NEXT:    v_mad_i32_i24 v2, s16, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s28
-; GFX8-NEXT:    v_mad_i32_i24 v2, s14, v3, v2
+; GFX8-NEXT:    s_ashr_i64 s[8:9], s[4:5], 60
+; GFX8-NEXT:    s_lshl_b32 s9, s5, 4
+; GFX8-NEXT:    s_ashr_i64 s[16:17], s[8:9], 60
+; GFX8-NEXT:    s_lshl_b32 s9, s5, 20
+; GFX8-NEXT:    s_lshl_b32 s11, s5, 8
+; GFX8-NEXT:    s_lshl_b32 s13, s5, 12
+; GFX8-NEXT:    s_lshl_b32 s15, s5, 16
+; GFX8-NEXT:    s_ashr_i64 s[18:19], s[8:9], 60
+; GFX8-NEXT:    s_lshl_b32 s9, s5, 24
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 28
+; GFX8-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
+; GFX8-NEXT:    s_lshl_b32 s5, s7, 4
+; GFX8-NEXT:    s_ashr_i64 s[24:25], s[4:5], 60
+; GFX8-NEXT:    s_lshl_b32 s5, s7, 8
+; GFX8-NEXT:    s_ashr_i64 s[26:27], s[4:5], 60
+; GFX8-NEXT:    s_lshl_b32 s5, s7, 12
+; GFX8-NEXT:    s_ashr_i64 s[28:29], s[4:5], 60
+; GFX8-NEXT:    s_lshl_b32 s5, s7, 16
+; GFX8-NEXT:    s_ashr_i64 s[30:31], s[4:5], 60
+; GFX8-NEXT:    s_lshl_b32 s5, s7, 20
+; GFX8-NEXT:    s_ashr_i64 s[32:33], s[4:5], 60
+; GFX8-NEXT:    s_lshl_b32 s5, s7, 24
+; GFX8-NEXT:    s_ashr_i64 s[34:35], s[4:5], 60
+; GFX8-NEXT:    s_lshl_b32 s5, s7, 28
+; GFX8-NEXT:    s_ashr_i64 s[22:23], s[6:7], 60
+; GFX8-NEXT:    s_ashr_i64 s[6:7], s[4:5], 60
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX8-NEXT:    s_ashr_i64 s[20:21], s[8:9], 60
+; GFX8-NEXT:    v_mov_b32_e32 v1, s34
+; GFX8-NEXT:    v_mad_i32_i24 v0, s20, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s32
+; GFX8-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
+; GFX8-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX8-NEXT:    v_mov_b32_e32 v1, s30
+; GFX8-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX8-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX8-NEXT:    v_mov_b32_e32 v1, s28
+; GFX8-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
 ; GFX8-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
-; GFX8-NEXT:    v_mov_b32_e32 v3, s26
-; GFX8-NEXT:    v_mad_i32_i24 v2, s10, v3, v2
-; GFX8-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX8-NEXT:    v_mov_b32_e32 v3, s24
-; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s22
-; GFX8-NEXT:    v_mad_i32_i24 v2, s12, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s20
-; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s26
+; GFX8-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s24
+; GFX8-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s22
+; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1369,56 +1370,56 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s5, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s7, s[6:7], 0x0
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i64 s[0:1], s[4:5], 60
-; GFX9-NEXT:    s_lshl_b32 s1, s5, 4
-; GFX9-NEXT:    s_ashr_i64 s[12:13], s[0:1], 60
-; GFX9-NEXT:    s_lshl_b32 s1, s5, 16
-; GFX9-NEXT:    s_ashr_i64 s[14:15], s[0:1], 60
-; GFX9-NEXT:    s_lshl_b32 s1, s5, 20
-; GFX9-NEXT:    s_ashr_i64 s[16:17], s[0:1], 60
-; GFX9-NEXT:    s_lshl_b32 s1, s5, 24
-; GFX9-NEXT:    s_ashr_i64 s[18:19], s[0:1], 60
-; GFX9-NEXT:    s_lshl_b32 s1, s5, 28
-; GFX9-NEXT:    s_lshl_b32 s9, s5, 8
-; GFX9-NEXT:    s_lshl_b32 s11, s5, 12
-; GFX9-NEXT:    s_ashr_i64 s[4:5], s[0:1], 60
-; GFX9-NEXT:    s_lshl_b32 s1, s7, 4
-; GFX9-NEXT:    s_ashr_i64 s[22:23], s[0:1], 60
-; GFX9-NEXT:    s_lshl_b32 s1, s7, 8
-; GFX9-NEXT:    s_ashr_i64 s[24:25], s[0:1], 60
-; GFX9-NEXT:    s_lshl_b32 s1, s7, 12
-; GFX9-NEXT:    s_ashr_i64 s[26:27], s[0:1], 60
-; GFX9-NEXT:    s_lshl_b32 s1, s7, 16
-; GFX9-NEXT:    s_ashr_i64 s[28:29], s[0:1], 60
-; GFX9-NEXT:    s_lshl_b32 s1, s7, 20
-; GFX9-NEXT:    s_ashr_i64 s[30:31], s[0:1], 60
-; GFX9-NEXT:    s_lshl_b32 s1, s7, 24
-; GFX9-NEXT:    s_ashr_i64 s[32:33], s[0:1], 60
-; GFX9-NEXT:    s_lshl_b32 s1, s7, 28
-; GFX9-NEXT:    s_ashr_i64 s[20:21], s[6:7], 60
-; GFX9-NEXT:    s_ashr_i64 s[6:7], s[0:1], 60
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s4, v2, v3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s32
-; GFX9-NEXT:    v_mad_i32_i24 v2, s18, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s30
-; GFX9-NEXT:    v_mad_i32_i24 v2, s16, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s28
-; GFX9-NEXT:    v_mad_i32_i24 v2, s14, v3, v2
+; GFX9-NEXT:    s_ashr_i64 s[8:9], s[4:5], 60
+; GFX9-NEXT:    s_lshl_b32 s9, s5, 4
+; GFX9-NEXT:    s_ashr_i64 s[16:17], s[8:9], 60
+; GFX9-NEXT:    s_lshl_b32 s9, s5, 20
+; GFX9-NEXT:    s_lshl_b32 s11, s5, 8
+; GFX9-NEXT:    s_lshl_b32 s13, s5, 12
+; GFX9-NEXT:    s_lshl_b32 s15, s5, 16
+; GFX9-NEXT:    s_ashr_i64 s[18:19], s[8:9], 60
+; GFX9-NEXT:    s_lshl_b32 s9, s5, 24
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 28
+; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
+; GFX9-NEXT:    s_lshl_b32 s5, s7, 4
+; GFX9-NEXT:    s_ashr_i64 s[24:25], s[4:5], 60
+; GFX9-NEXT:    s_lshl_b32 s5, s7, 8
+; GFX9-NEXT:    s_ashr_i64 s[26:27], s[4:5], 60
+; GFX9-NEXT:    s_lshl_b32 s5, s7, 12
+; GFX9-NEXT:    s_ashr_i64 s[28:29], s[4:5], 60
+; GFX9-NEXT:    s_lshl_b32 s5, s7, 16
+; GFX9-NEXT:    s_ashr_i64 s[30:31], s[4:5], 60
+; GFX9-NEXT:    s_lshl_b32 s5, s7, 20
+; GFX9-NEXT:    s_ashr_i64 s[32:33], s[4:5], 60
+; GFX9-NEXT:    s_lshl_b32 s5, s7, 24
+; GFX9-NEXT:    s_ashr_i64 s[34:35], s[4:5], 60
+; GFX9-NEXT:    s_lshl_b32 s5, s7, 28
+; GFX9-NEXT:    s_ashr_i64 s[22:23], s[6:7], 60
+; GFX9-NEXT:    s_ashr_i64 s[6:7], s[4:5], 60
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX9-NEXT:    s_ashr_i64 s[20:21], s[8:9], 60
+; GFX9-NEXT:    v_mov_b32_e32 v1, s34
+; GFX9-NEXT:    v_mad_i32_i24 v0, s20, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s32
+; GFX9-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
+; GFX9-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX9-NEXT:    v_mov_b32_e32 v1, s30
+; GFX9-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX9-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX9-NEXT:    v_mov_b32_e32 v1, s28
+; GFX9-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
 ; GFX9-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
-; GFX9-NEXT:    v_mov_b32_e32 v3, s26
-; GFX9-NEXT:    v_mad_i32_i24 v2, s10, v3, v2
-; GFX9-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX9-NEXT:    v_mov_b32_e32 v3, s24
-; GFX9-NEXT:    v_mad_i32_i24 v2, s8, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s22
-; GFX9-NEXT:    v_mad_i32_i24 v2, s12, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s20
-; GFX9-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s26
+; GFX9-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s24
+; GFX9-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s22
+; GFX9-NEXT:    v_mad_i32_i24 v2, s8, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1430,56 +1431,56 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s5, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s7, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_ashr_i64 s[0:1], s[4:5], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s1, s5, 4
-; GFX9-DL-NEXT:    s_ashr_i64 s[12:13], s[0:1], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s1, s5, 16
-; GFX9-DL-NEXT:    s_ashr_i64 s[14:15], s[0:1], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s1, s5, 20
-; GFX9-DL-NEXT:    s_ashr_i64 s[16:17], s[0:1], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s1, s5, 24
-; GFX9-DL-NEXT:    s_ashr_i64 s[18:19], s[0:1], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s1, s5, 28
-; GFX9-DL-NEXT:    s_lshl_b32 s9, s5, 8
-; GFX9-DL-NEXT:    s_lshl_b32 s11, s5, 12
-; GFX9-DL-NEXT:    s_ashr_i64 s[4:5], s[0:1], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 4
-; GFX9-DL-NEXT:    s_ashr_i64 s[22:23], s[0:1], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 8
-; GFX9-DL-NEXT:    s_ashr_i64 s[24:25], s[0:1], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 12
-; GFX9-DL-NEXT:    s_ashr_i64 s[26:27], s[0:1], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 16
-; GFX9-DL-NEXT:    s_ashr_i64 s[28:29], s[0:1], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 20
-; GFX9-DL-NEXT:    s_ashr_i64 s[30:31], s[0:1], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 24
-; GFX9-DL-NEXT:    s_ashr_i64 s[32:33], s[0:1], 60
-; GFX9-DL-NEXT:    s_lshl_b32 s1, s7, 28
-; GFX9-DL-NEXT:    s_ashr_i64 s[20:21], s[6:7], 60
-; GFX9-DL-NEXT:    s_ashr_i64 s[6:7], s[0:1], 60
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v2, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s32
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s18, v3, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s30
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s16, v3, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s28
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s14, v3, v2
+; GFX9-DL-NEXT:    s_ashr_i64 s[8:9], s[4:5], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s9, s5, 4
+; GFX9-DL-NEXT:    s_ashr_i64 s[16:17], s[8:9], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s9, s5, 20
+; GFX9-DL-NEXT:    s_lshl_b32 s11, s5, 8
+; GFX9-DL-NEXT:    s_lshl_b32 s13, s5, 12
+; GFX9-DL-NEXT:    s_lshl_b32 s15, s5, 16
+; GFX9-DL-NEXT:    s_ashr_i64 s[18:19], s[8:9], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s9, s5, 24
+; GFX9-DL-NEXT:    s_lshl_b32 s5, s5, 28
+; GFX9-DL-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 4
+; GFX9-DL-NEXT:    s_ashr_i64 s[24:25], s[4:5], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 8
+; GFX9-DL-NEXT:    s_ashr_i64 s[26:27], s[4:5], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 12
+; GFX9-DL-NEXT:    s_ashr_i64 s[28:29], s[4:5], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 16
+; GFX9-DL-NEXT:    s_ashr_i64 s[30:31], s[4:5], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 20
+; GFX9-DL-NEXT:    s_ashr_i64 s[32:33], s[4:5], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 24
+; GFX9-DL-NEXT:    s_ashr_i64 s[34:35], s[4:5], 60
+; GFX9-DL-NEXT:    s_lshl_b32 s5, s7, 28
+; GFX9-DL-NEXT:    s_ashr_i64 s[22:23], s[6:7], 60
+; GFX9-DL-NEXT:    s_ashr_i64 s[6:7], s[4:5], 60
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
+; GFX9-DL-NEXT:    s_ashr_i64 s[20:21], s[8:9], 60
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s34
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s20, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s32
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s18, v1, v0
+; GFX9-DL-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s30
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s14, v1, v0
+; GFX9-DL-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s28
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s12, v1, v0
 ; GFX9-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s26
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s10, v3, v2
-; GFX9-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s24
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s8, v3, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s22
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s12, v3, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s20
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s26
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s24
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, s16, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s22
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s8, v1, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1492,48 +1493,48 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s5, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s7, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshl_b32 s1, s5, 28
-; GFX10-DL-NEXT:    s_lshl_b32 s9, s7, 28
-; GFX10-DL-NEXT:    s_lshl_b32 s11, s5, 24
-; GFX10-DL-NEXT:    s_lshl_b32 s13, s7, 24
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-DL-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
+; GFX10-DL-NEXT:    s_lshl_b32 s9, s5, 28
+; GFX10-DL-NEXT:    s_lshl_b32 s11, s7, 28
+; GFX10-DL-NEXT:    s_lshl_b32 s13, s5, 24
+; GFX10-DL-NEXT:    s_lshl_b32 s15, s7, 24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX10-DL-NEXT:    s_lshl_b32 s1, s5, 20
 ; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
-; GFX10-DL-NEXT:    s_lshl_b32 s9, s7, 20
 ; GFX10-DL-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s8, v2
-; GFX10-DL-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
-; GFX10-DL-NEXT:    s_lshl_b32 s11, s5, 16
+; GFX10-DL-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX10-DL-NEXT:    s_lshl_b32 s9, s5, 20
+; GFX10-DL-NEXT:    s_lshl_b32 s11, s7, 20
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s8, s10, v0
+; GFX10-DL-NEXT:    s_lshl_b32 s13, s5, 16
+; GFX10-DL-NEXT:    s_lshl_b32 s15, s7, 16
 ; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX10-DL-NEXT:    s_lshl_b32 s1, s7, 16
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s10, s12, v2
-; GFX10-DL-NEXT:    s_lshl_b32 s9, s5, 12
 ; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s12, s14, v0
+; GFX10-DL-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX10-DL-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX10-DL-NEXT:    s_lshl_b32 s9, s5, 12
 ; GFX10-DL-NEXT:    s_lshl_b32 s11, s7, 12
-; GFX10-DL-NEXT:    s_ashr_i64 s[12:13], s[0:1], 60
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s8, v2
-; GFX10-DL-NEXT:    s_lshl_b32 s1, s5, 8
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s8, s10, v0
+; GFX10-DL-NEXT:    s_lshl_b32 s13, s5, 8
+; GFX10-DL-NEXT:    s_lshl_b32 s15, s7, 8
+; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
+; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s12, s14, v0
+; GFX10-DL-NEXT:    s_lshl_b32 s9, s5, 4
+; GFX10-DL-NEXT:    s_lshl_b32 s11, s7, 4
+; GFX10-DL-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX10-DL-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s8, s10, v0
 ; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX10-DL-NEXT:    s_ashr_i64 s[14:15], s[10:11], 60
-; GFX10-DL-NEXT:    s_lshl_b32 s9, s7, 8
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s10, s12, v2
-; GFX10-DL-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
-; GFX10-DL-NEXT:    s_lshl_b32 s11, s5, 4
-; GFX10-DL-NEXT:    s_lshl_b32 s1, s7, 4
-; GFX10-DL-NEXT:    s_ashr_i64 s[12:13], s[8:9], 60
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s8, s14, v2
-; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[10:11], 60
-; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[0:1], 60
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s12, v2
-; GFX10-DL-NEXT:    s_ashr_i64 s[0:1], s[4:5], 60
-; GFX10-DL-NEXT:    s_ashr_i64 s[4:5], s[6:7], 60
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s8, s10, v2
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s4, v2
+; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX10-DL-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
+; GFX10-DL-NEXT:    s_ashr_i64 s[6:7], s[6:7], 60
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s12, s14, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s8, s10, v0
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s6, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
@@ -1579,31 +1580,31 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[10:11], 0x0
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[8:9], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_ashr_i32 s8, s0, 28
-; GFX7-NEXT:    s_bfe_i32 s9, s0, 0x40018
-; GFX7-NEXT:    s_bfe_i32 s16, s1, 0x40018
-; GFX7-NEXT:    s_bfe_i32 s17, s1, 0x40014
-; GFX7-NEXT:    s_bfe_i32 s18, s1, 0x40010
-; GFX7-NEXT:    s_bfe_i32 s19, s1, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s20, s1, 0x40004
-; GFX7-NEXT:    s_bfe_i32 s21, s1, 0x40008
-; GFX7-NEXT:    s_ashr_i32 s15, s1, 28
-; GFX7-NEXT:    s_bfe_i32 s1, s1, 0x4000c
-; GFX7-NEXT:    s_bfe_i32 s10, s0, 0x40014
-; GFX7-NEXT:    s_bfe_i32 s11, s0, 0x40010
-; GFX7-NEXT:    s_bfe_i32 s12, s0, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s16, s0, 0x40018
+; GFX7-NEXT:    s_bfe_i32 s17, s0, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s18, s0, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s19, s0, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s20, s0, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s21, s0, 0x40008
+; GFX7-NEXT:    s_ashr_i32 s15, s0, 28
+; GFX7-NEXT:    s_bfe_i32 s0, s0, 0x4000c
+; GFX7-NEXT:    s_ashr_i32 s8, s1, 28
+; GFX7-NEXT:    s_bfe_i32 s9, s1, 0x40018
+; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s11, s1, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s12, s1, 0x40000
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s19
-; GFX7-NEXT:    s_bfe_i32 s13, s0, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s13, s1, 0x40004
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s20
-; GFX7-NEXT:    s_bfe_i32 s14, s0, 0x40008
+; GFX7-NEXT:    s_bfe_i32 s14, s1, 0x40008
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s21
-; GFX7-NEXT:    s_bfe_i32 s0, s0, 0x4000c
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mul_i32_i24_e32 v1, s0, v1
+; GFX7-NEXT:    s_bfe_i32 s1, s1, 0x4000c
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mul_i32_i24_e32 v1, s1, v1
 ; GFX7-NEXT:    v_mul_i32_i24_e32 v2, s14, v2
 ; GFX7-NEXT:    v_mul_i32_i24_e32 v3, s13, v3
 ; GFX7-NEXT:    v_mul_i32_i24_e32 v4, s12, v4
@@ -1636,68 +1637,68 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b16_e64 v3, 12, s2
-; GFX8-NEXT:    v_lshlrev_b16_e64 v4, 12, s4
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 4
-; GFX8-NEXT:    s_lshr_b32 s1, s2, 8
-; GFX8-NEXT:    s_lshr_b32 s5, s4, 4
-; GFX8-NEXT:    s_lshr_b32 s6, s4, 8
-; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s1
-; GFX8-NEXT:    v_lshlrev_b16_e64 v6, 12, s0
-; GFX8-NEXT:    v_lshlrev_b16_e64 v7, 12, s6
-; GFX8-NEXT:    v_lshlrev_b16_e64 v8, 12, s5
+; GFX8-NEXT:    v_lshlrev_b16_e64 v4, 12, s2
+; GFX8-NEXT:    s_lshr_b32 s15, s2, 4
+; GFX8-NEXT:    s_lshr_b32 s16, s2, 8
+; GFX8-NEXT:    v_lshlrev_b16_e64 v12, 12, s16
+; GFX8-NEXT:    v_lshlrev_b16_e64 v3, 12, s0
+; GFX8-NEXT:    s_lshr_b32 s8, s0, 4
+; GFX8-NEXT:    s_lshr_b32 s9, s0, 8
+; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s9
+; GFX8-NEXT:    v_lshlrev_b16_e64 v6, 12, s8
+; GFX8-NEXT:    v_lshlrev_b16_e64 v13, 12, s15
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 12
-; GFX8-NEXT:    s_lshr_b32 s1, s4, 12
+; GFX8-NEXT:    s_lshr_b32 s7, s0, 12
+; GFX8-NEXT:    s_lshr_b32 s14, s2, 12
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
-; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
-; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
-; GFX8-NEXT:    v_lshlrev_b16_e64 v9, 12, s0
-; GFX8-NEXT:    v_lshlrev_b16_e64 v10, 12, s1
-; GFX8-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX8-NEXT:    v_mul_u32_u24_e32 v5, v5, v7
-; GFX8-NEXT:    v_lshlrev_b16_e64 v11, 12, s5
-; GFX8-NEXT:    v_lshlrev_b16_e64 v12, 12, s6
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 20
-; GFX8-NEXT:    s_lshr_b32 s1, s4, 20
-; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
-; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
-; GFX8-NEXT:    v_lshlrev_b16_e64 v13, 12, s0
-; GFX8-NEXT:    v_lshlrev_b16_e64 v14, 12, s1
-; GFX8-NEXT:    s_lshr_b32 s5, s2, 24
-; GFX8-NEXT:    s_lshr_b32 s6, s4, 24
-; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
-; GFX8-NEXT:    v_lshlrev_b16_e64 v15, 12, s5
-; GFX8-NEXT:    v_lshlrev_b16_e64 v17, 12, s6
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 28
-; GFX8-NEXT:    s_lshr_b32 s1, s4, 28
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
+; GFX8-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
+; GFX8-NEXT:    v_lshlrev_b16_e64 v14, 12, s14
+; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s13, s2, 16
+; GFX8-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
+; GFX8-NEXT:    v_lshlrev_b16_e64 v8, 12, s6
+; GFX8-NEXT:    v_lshlrev_b16_e64 v15, 12, s13
+; GFX8-NEXT:    s_lshr_b32 s5, s0, 20
+; GFX8-NEXT:    s_lshr_b32 s12, s2, 20
+; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
-; GFX8-NEXT:    v_lshlrev_b16_e64 v16, 12, s0
-; GFX8-NEXT:    v_lshlrev_b16_e64 v18, 12, s1
+; GFX8-NEXT:    v_lshlrev_b16_e64 v9, 12, s5
+; GFX8-NEXT:    v_lshlrev_b16_e64 v16, 12, s12
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX8-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
-; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
+; GFX8-NEXT:    v_lshlrev_b16_e64 v10, 12, s4
+; GFX8-NEXT:    v_lshlrev_b16_e64 v17, 12, s11
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 28
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 28
+; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
+; GFX8-NEXT:    v_lshlrev_b16_e64 v11, 12, s1
+; GFX8-NEXT:    v_lshlrev_b16_e64 v18, 12, s10
+; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
+; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
+; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v18
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v4, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, v6, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, v6, v13, v2
 ; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX8-NEXT:    v_mad_u32_u24 v2, v9, v10, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, v11, v12, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, v13, v14, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, v15, v17, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, v16, v18, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, v7, v14, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, v8, v15, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, v9, v16, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, v10, v17, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, v11, v18, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1707,64 +1708,64 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s4, s2, 15
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s12, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40014
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s10, s2
+; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s11, s12
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s13, s6, 28
+; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x4000c
+; GFX9-NEXT:    s_and_b32 s18, s6, 15
+; GFX9-NEXT:    s_bfe_u32 s6, s6, 0x40004
+; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s18, s6
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s16, s17
+; GFX9-NEXT:    v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s14, s15
+; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v1, v5
+; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX9-NEXT:    s_and_b32 s5, s4, 15
-; GFX9-NEXT:    s_bfe_u32 s6, s4, 0x40004
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s5, s6
-; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x4000c
-; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s5, s6
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40014
-; GFX9-NEXT:    v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_bfe_u32 s13, s2, 0x40018
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s9, s10
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX9-NEXT:    v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s13, s2
-; GFX9-NEXT:    s_bfe_u32 s7, s4, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s8, s4, 0x4000c
-; GFX9-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s7, s8
+; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
+; GFX9-NEXT:    global_load_ushort v6, v[0:1], off
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s7, s13
+; GFX9-NEXT:    v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v3, v3, v7
-; GFX9-NEXT:    s_bfe_u32 s11, s4, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s12, s4, 0x40014
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s11, s12
-; GFX9-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_bfe_u32 s14, s4, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s14, s4
-; GFX9-NEXT:    v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v5, v9
-; GFX9-NEXT:    v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_mul_lo_u16 v6, v6, v10
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v6, v4, v6
+; GFX9-NEXT:    v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT:    v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v2
+; GFX9-NEXT:    v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1774,64 +1775,64 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_and_b32 s4, s2, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40018
+; GFX9-DL-NEXT:    s_lshr_b32 s12, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40014
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s10, s2
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s11, s12
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s6, 0x40018
+; GFX9-DL-NEXT:    s_lshr_b32 s13, s6, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s17, s6, 0x4000c
+; GFX9-DL-NEXT:    s_and_b32 s18, s6, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s6, 0x40004
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s18, s6
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s16, s17
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s14, s15
+; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v1, v5
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v0, v4
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX9-DL-NEXT:    s_and_b32 s5, s4, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s1, s5, s6
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x4000c
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s5, s6
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40014
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s2, 0x40018
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s9, s10
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s13, s2
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s4, 0x4000c
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s7, s8
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
+; GFX9-DL-NEXT:    global_load_ushort v6, v[0:1], off
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s7, s13
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v7
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s4, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s4, 0x40014
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s11, s12
-; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s4, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v8
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s14, s4
-; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v5, v9
-; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v6, v6, v10
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-DL-NEXT:    v_add_u32_e32 v6, v4, v6
+; GFX9-DL-NEXT:    v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT:    v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_e32 v4, v4, v2
+; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v6
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1841,65 +1842,65 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX10-DL-NEXT:    s_and_b32 s5, s4, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40008
+; GFX10-DL-NEXT:    s_and_b32 s5, s0, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s7, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 28
 ; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s0, 0x40010
 ; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40010
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s5 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s1, s6
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s10, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s5 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s7 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x4000c
 ; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40010
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s6, s0
 ; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40014
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s7 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40010
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s8, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40018
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40014
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s6, s8
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s9, s10
 ; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s6, s0
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
 ; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40018
-; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s5, s2
-; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s6, s4
-; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1]
 ; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v8, 12, s1 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v5, v5, v6
-; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v7, 12, v8 op_sel_hi:[0,1]
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v7, v6, v7
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s2, s4
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s7, s1
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v5
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v4
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v5
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v7
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
@@ -2289,80 +2290,80 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s4, 4
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s5, 4
-; GFX10-DL-NEXT:    s_lshr_b32 s6, s4, 12
-; GFX10-DL-NEXT:    s_lshr_b32 s7, s5, 12
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s4
+; GFX10-DL-NEXT:    s_lshr_b32 s8, s0, 4
+; GFX10-DL-NEXT:    s_lshr_b32 s15, s1, 4
+; GFX10-DL-NEXT:    s_lshr_b32 s9, s0, 12
+; GFX10-DL-NEXT:    s_lshr_b32 s16, s1, 12
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s0
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s8
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s15
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s6
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s5
-; GFX10-DL-NEXT:    s_lshr_b32 s8, s4, 8
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s16
+; GFX10-DL-NEXT:    s_lshr_b32 s10, s0, 8
+; GFX10-DL-NEXT:    s_lshr_b32 s17, s1, 8
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s9
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 12, v7
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v12
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s10
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v5
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s8
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s0
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 12, v7
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s17
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, v7, v12
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v19, 12, v6
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v14, 12, v14
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 20
+; GFX10-DL-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s0, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s7, s0, 24
 ; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, v3, v4
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 12, v6
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v8, 12, v8
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v9
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, v5, v7
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s4, 20
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v9, 12, v10
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s5, 20
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, v6, v8
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v3
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s0
-; GFX10-DL-NEXT:    s_lshr_b32 s8, s5, 16
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s1
-; GFX10-DL-NEXT:    s_lshr_b32 s9, s5, 28
-; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    s_lshr_b32 s7, s4, 28
-; GFX10-DL-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v4, v9
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v5
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s9
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX10-DL-NEXT:    s_lshr_b32 s1, s5, 24
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s7
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v19, v14
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 8, v7
+; GFX10-DL-NEXT:    s_lshr_b32 s11, s1, 20
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v13
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v5
+; GFX10-DL-NEXT:    s_lshr_b32 s12, s1, 16
+; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    s_lshr_b32 s13, s1, 28
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s7
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s6
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s5
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s4
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s11
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, v5, v12
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 8, v4
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s12
+; GFX10-DL-NEXT:    s_lshr_b32 s14, s1, 24
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 12, v8
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v8, 12, v9
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v9, 12, v10
+; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v16, 12, s13
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v11
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v10, 12, v13
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s14
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 12, v7
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v8, 12, v8
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s8
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s0
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s1
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v15, 12, v9
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v19, 12, v6
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v9, 12, v13
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v10, 12, v10
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v16
 ; GFX10-DL-NEXT:    v_or_b32_e32 v4, v3, v4
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, v7, v8
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v11
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v12
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, v19, v10
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, v15, v9
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 8, v6
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, v5, v10
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v15
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v10, v9, v7
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v8, v8, v11
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v7
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, v11, v12
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v8
-; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v5
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, v6, v12
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 8, v8
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v9
+; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 0532042417da..2041fbb82460 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -15,42 +15,42 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s10, s[10:11], 0x0
+; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s8, s0, 28
-; GFX7-NEXT:    s_lshr_b32 s15, s1, 28
-; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s21, s1, 0x40004
-; GFX7-NEXT:    s_and_b32 s1, s1, 15
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x40010
+; GFX7-NEXT:    s_lshr_b32 s1, s0, 28
+; GFX7-NEXT:    s_lshr_b32 s11, s10, 28
+; GFX7-NEXT:    s_bfe_u32 s15, s10, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s16, s10, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s17, s10, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s18, s10, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s19, s10, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s20, s10, 0x40004
+; GFX7-NEXT:    s_and_b32 s10, s10, 15
+; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
 ; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x4000c
 ; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x40008
 ; GFX7-NEXT:    s_bfe_u32 s14, s0, 0x40004
 ; GFX7-NEXT:    s_and_b32 s0, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s21
-; GFX7-NEXT:    v_mad_u32_u24 v0, s14, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s20
-; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s14, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s19
-; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s18
-; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s17
-; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s15
+; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
+; GFX7-NEXT:    v_mad_u32_u24 v0, s1, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -60,44 +60,44 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 28
-; GFX8-NEXT:    s_lshr_b32 s11, s4, 28
-; GFX8-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s13, s4, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s14, s4, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s15, s4, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s16, s4, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s17, s4, 0x40004
-; GFX8-NEXT:    s_and_b32 s4, s4, 15
-; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
+; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX8-NEXT:    s_and_b32 s6, s6, 15
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x40004
 ; GFX8-NEXT:    s_and_b32 s2, s2, 15
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v3, s17
-; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s16
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s15
-; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s14
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s13
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s12
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s11
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s19
+; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s18
+; GFX8-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s16
+; GFX8-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s15
+; GFX8-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s14
+; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s13
+; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -107,44 +107,44 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s0, s2, 28
-; GFX9-NEXT:    s_lshr_b32 s11, s4, 28
-; GFX9-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s13, s4, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s14, s4, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s15, s4, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s16, s4, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s17, s4, 0x40004
-; GFX9-NEXT:    s_and_b32 s4, s4, 15
-; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
+; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX9-NEXT:    s_and_b32 s6, s6, 15
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s12, s2, 0x40004
 ; GFX9-NEXT:    s_and_b32 s2, s2, 15
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v2, v3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s17
-; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s16
-; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s14
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s13
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s12
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s18
+; GFX9-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s16
+; GFX9-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s15
+; GFX9-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s14
+; GFX9-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -153,15 +153,15 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s2, v2, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s4, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -171,14 +171,14 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s4, s5, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s2, s4, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
@@ -304,46 +304,46 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s0, s2, 15
-; GFX8-NEXT:    s_and_b32 s1, s4, 15
+; GFX8-NEXT:    s_and_b32 s1, s2, 15
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v4, s5
-; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s8, s4, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s10, s4, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX8-NEXT:    s_lshr_b32 s14, s4, 28
-; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    s_and_b32 s1, s0, 15
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40008
 ; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v6, s4
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v6, s7
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40010
 ; GFX8-NEXT:    v_mov_b32_e32 v7, s8
-; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s10
-; GFX8-NEXT:    s_bfe_u32 s13, s2, 0x40018
-; GFX8-NEXT:    v_mov_b32_e32 v9, s12
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v8, s9
+; GFX8-NEXT:    s_bfe_u32 s12, s0, 0x40018
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s11, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s13, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s14
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -352,46 +352,46 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-NEXT:    s_and_b32 s1, s4, 15
+; GFX9-NEXT:    s_and_b32 s1, s2, 15
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s5, s4, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s8, s4, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s10, s4, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s14, s4, 28
-; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    s_and_b32 s1, s0, 15
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v6, s4
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40010
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s8
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v8, s10
-; GFX9-NEXT:    s_bfe_u32 s13, s2, 0x40018
-; GFX9-NEXT:    v_mov_b32_e32 v9, s12
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v8, s9
+; GFX9-NEXT:    s_bfe_u32 s12, s0, 0x40018
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s11, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s13, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s14
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -400,46 +400,46 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-DL-NEXT:    s_and_b32 s1, s4, 15
+; GFX9-DL-NEXT:    s_and_b32 s1, s2, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s4, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s4, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s4, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s14, s4, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s4, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-DL-NEXT:    s_and_b32 s1, s0, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40008
 ; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s4
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40010
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s8
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s10
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s2, 0x40018
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s12
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s9
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s0, 0x40018
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s11, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s13, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s14
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -449,38 +449,38 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX10-DL-NEXT:    s_and_b32 s1, s4, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s9, s2, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s11, s2, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s12, s4, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s13, s2, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s14, s4, 0x40014
+; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
+; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40004
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40018
-; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40008
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x4000c
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s8, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s9, s10, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s11, s12, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s13, s14, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
@@ -606,46 +606,46 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s0, s2, 15
-; GFX8-NEXT:    s_and_b32 s1, s4, 15
+; GFX8-NEXT:    s_and_b32 s1, s2, 15
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v4, s5
-; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s8, s4, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s10, s4, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX8-NEXT:    s_lshr_b32 s14, s4, 28
-; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    s_and_b32 s1, s0, 15
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40008
 ; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v6, s4
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v6, s7
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40010
 ; GFX8-NEXT:    v_mov_b32_e32 v7, s8
-; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s10
-; GFX8-NEXT:    s_bfe_u32 s13, s2, 0x40018
-; GFX8-NEXT:    v_mov_b32_e32 v9, s12
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v8, s9
+; GFX8-NEXT:    s_bfe_u32 s12, s0, 0x40018
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s11, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s13, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s14
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -654,46 +654,46 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-NEXT:    s_and_b32 s1, s4, 15
+; GFX9-NEXT:    s_and_b32 s1, s2, 15
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s5, s4, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s8, s4, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s10, s4, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s14, s4, 28
-; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    s_and_b32 s1, s0, 15
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40008
 ; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v6, s4
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40010
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s8
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v8, s10
-; GFX9-NEXT:    s_bfe_u32 s13, s2, 0x40018
-; GFX9-NEXT:    v_mov_b32_e32 v9, s12
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v8, s9
+; GFX9-NEXT:    s_bfe_u32 s12, s0, 0x40018
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s11, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s13, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s14
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -702,46 +702,46 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-DL-NEXT:    s_and_b32 s1, s4, 15
+; GFX9-DL-NEXT:    s_and_b32 s1, s2, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s4, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s4, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s4, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s14, s4, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s4, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-DL-NEXT:    s_and_b32 s1, s0, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40008
 ; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s4
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40010
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s8
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s10
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s2, 0x40018
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s12
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s9
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s0, 0x40018
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s11, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s13, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s14
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -751,38 +751,38 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX10-DL-NEXT:    s_and_b32 s1, s4, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s9, s2, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s11, s2, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s12, s4, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s13, s2, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s14, s4, 0x40014
+; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
+; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40004
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40018
-; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40008
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x4000c
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s8, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s9, s10, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s11, s12, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s13, s14, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
@@ -909,48 +909,48 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s0, s2, 15
-; GFX8-NEXT:    s_and_b32 s1, s4, 15
+; GFX8-NEXT:    s_and_b32 s1, s2, 15
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s6, s4, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s7, s4, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v4, s5
-; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40008
+; GFX8-NEXT:    s_and_b32 s1, s0, 15
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40004
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x4000c
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40008
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX8-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40010
 ; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX8-NEXT:    s_bfe_u32 s11, s4, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v7, s9
-; GFX8-NEXT:    s_bfe_u32 s13, s4, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s11
-; GFX8-NEXT:    s_bfe_u32 s14, s2, 0x40018
-; GFX8-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX8-NEXT:    v_mov_b32_e32 v9, s13
-; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v7, s6
+; GFX8-NEXT:    s_lshr_b32 s11, s2, 28
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v8, s8
+; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x40018
+; GFX8-NEXT:    v_mov_b32_e32 v9, s2
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s12, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s14, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -960,48 +960,48 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-NEXT:    s_and_b32 s1, s4, 15
+; GFX9-NEXT:    s_and_b32 s1, s2, 15
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s6, s4, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s7, s4, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40008
+; GFX9-NEXT:    s_and_b32 s1, s0, 15
+; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40004
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s7
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x4000c
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40008
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40010
 ; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX9-NEXT:    s_bfe_u32 s11, s4, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v7, s9
-; GFX9-NEXT:    s_bfe_u32 s13, s4, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s12, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-NEXT:    s_bfe_u32 s14, s2, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX9-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v7, s6
+; GFX9-NEXT:    s_lshr_b32 s11, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v8, s8
+; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40018
+; GFX9-NEXT:    v_mov_b32_e32 v9, s2
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s12, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s14, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -1011,48 +1011,48 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-DL-NEXT:    s_and_b32 s1, s4, 15
+; GFX9-DL-NEXT:    s_and_b32 s1, s2, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s4, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40008
+; GFX9-DL-NEXT:    s_and_b32 s1, s0, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s7
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x4000c
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40008
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40010
 ; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s4, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s9
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s4, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s2, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s2, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s6
+; GFX9-DL-NEXT:    s_lshr_b32 s11, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s8
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40018
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s2
+; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s12, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s14, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
@@ -1063,40 +1063,40 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX10-DL-NEXT:    s_and_b32 s1, s4, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s9, s4, 0x40008
+; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
+; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40008
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s4, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x4000c
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s8, s0
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s9, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s7, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s4, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40018
-; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
@@ -1208,48 +1208,48 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s0, s2, 15
-; GFX8-NEXT:    s_and_b32 s1, s4, 15
+; GFX8-NEXT:    s_and_b32 s1, s2, 15
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s6, s4, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s7, s4, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v4, s5
-; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40008
+; GFX8-NEXT:    s_and_b32 s1, s0, 15
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40004
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x4000c
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40008
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX8-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40010
 ; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX8-NEXT:    s_bfe_u32 s11, s4, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v7, s9
-; GFX8-NEXT:    s_bfe_u32 s13, s4, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s11
-; GFX8-NEXT:    s_bfe_u32 s14, s2, 0x40018
-; GFX8-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX8-NEXT:    v_mov_b32_e32 v9, s13
-; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v7, s6
+; GFX8-NEXT:    s_lshr_b32 s11, s2, 28
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v8, s8
+; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x40018
+; GFX8-NEXT:    v_mov_b32_e32 v9, s2
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s12, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s14, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -1259,48 +1259,48 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-NEXT:    s_and_b32 s1, s4, 15
+; GFX9-NEXT:    s_and_b32 s1, s2, 15
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s6, s4, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s7, s4, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40008
+; GFX9-NEXT:    s_and_b32 s1, s0, 15
+; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40004
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s7
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x4000c
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40008
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40010
 ; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX9-NEXT:    s_bfe_u32 s11, s4, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v7, s9
-; GFX9-NEXT:    s_bfe_u32 s13, s4, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s12, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-NEXT:    s_bfe_u32 s14, s2, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX9-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v7, s6
+; GFX9-NEXT:    s_lshr_b32 s11, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v8, s8
+; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40018
+; GFX9-NEXT:    v_mov_b32_e32 v9, s2
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s12, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s14, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -1310,48 +1310,48 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-DL-NEXT:    s_and_b32 s1, s4, 15
+; GFX9-DL-NEXT:    s_and_b32 s1, s2, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s4, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40008
+; GFX9-DL-NEXT:    s_and_b32 s1, s0, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s7
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x4000c
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40008
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40010
 ; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s4, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s9
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s4, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s2, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s2, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s6
+; GFX9-DL-NEXT:    s_lshr_b32 s11, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s8
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40018
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s2
+; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s12, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s14, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
@@ -1362,40 +1362,40 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX10-DL-NEXT:    s_and_b32 s1, s4, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
+; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x4000c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s4, 0x40008
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s8, s1
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s0, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s4, s8
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s7, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40018
-; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
@@ -1460,43 +1460,43 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s10, s[10:11], 0x0
+; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s8, s0, 28
-; GFX7-NEXT:    s_bfe_u32 s21, s1, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s15, s1, 28
-; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40008
-; GFX7-NEXT:    s_and_b32 s1, s1, 15
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x40010
+; GFX7-NEXT:    s_lshr_b32 s1, s0, 28
+; GFX7-NEXT:    s_bfe_u32 s20, s10, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s11, s10, 28
+; GFX7-NEXT:    s_bfe_u32 s15, s10, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s16, s10, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s17, s10, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s18, s10, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s19, s10, 0x40008
+; GFX7-NEXT:    s_and_b32 s10, s10, 15
+; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
 ; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x4000c
 ; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x40008
 ; GFX7-NEXT:    s_bfe_u32 s14, s0, 0x40004
 ; GFX7-NEXT:    s_and_b32 s0, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s10
+; GFX7-NEXT:    v_mov_b32_e32 v1, s21
 ; GFX7-NEXT:    v_mad_u32_u24 v1, s0, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s21
+; GFX7-NEXT:    v_mov_b32_e32 v2, s20
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v0, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v1, s14, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s20
-; GFX7-NEXT:    v_mad_u32_u24 v1, s13, v2, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s19
-; GFX7-NEXT:    v_mad_u32_u24 v1, s12, v2, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, s13, v2, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, s12, v2, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s17
-; GFX7-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX7-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s15
+; GFX7-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX7-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s15
+; GFX7-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s11
+; GFX7-NEXT:    v_mad_u32_u24 v1, s1, v2, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
@@ -1507,46 +1507,46 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 28
-; GFX8-NEXT:    s_bfe_u32 s17, s4, 0x40004
-; GFX8-NEXT:    s_lshr_b32 s11, s4, 28
-; GFX8-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s13, s4, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s14, s4, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s15, s4, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s16, s4, 0x40008
-; GFX8-NEXT:    s_and_b32 s4, s4, 15
-; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX8-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
+; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40008
+; GFX8-NEXT:    s_and_b32 s6, s6, 15
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x40004
 ; GFX8-NEXT:    s_and_b32 s2, s2, 15
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_mad_u32_u24 v3, s2, v2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s17
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v2, v3
-; GFX8-NEXT:    v_mad_u32_u24 v3, s10, v4, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s16
-; GFX8-NEXT:    v_mad_u32_u24 v3, s9, v4, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s15
-; GFX8-NEXT:    v_mad_u32_u24 v3, s8, v4, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s14
-; GFX8-NEXT:    v_mad_u32_u24 v3, s7, v4, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s13
-; GFX8-NEXT:    v_mad_u32_u24 v3, s6, v4, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s12
-; GFX8-NEXT:    v_mad_u32_u24 v3, s1, v4, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s11
-; GFX8-NEXT:    v_mad_u32_u24 v3, s0, v4, v3
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s19
+; GFX8-NEXT:    v_mad_u32_u24 v1, s2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
+; GFX8-NEXT:    v_mad_u32_u24 v1, s12, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s16
+; GFX8-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s15
+; GFX8-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s14
+; GFX8-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s13
+; GFX8-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1556,46 +1556,46 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s0, s2, 28
-; GFX9-NEXT:    s_bfe_u32 s17, s4, 0x40004
-; GFX9-NEXT:    s_lshr_b32 s11, s4, 28
-; GFX9-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s13, s4, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s14, s4, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s15, s4, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s16, s4, 0x40008
-; GFX9-NEXT:    s_and_b32 s4, s4, 15
-; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
+; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40008
+; GFX9-NEXT:    s_and_b32 s6, s6, 15
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s12, s2, 0x40004
 ; GFX9-NEXT:    s_and_b32 s2, s2, 15
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    v_mad_u32_u24 v3, s2, v2, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s17
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v2, v3
-; GFX9-NEXT:    v_mad_u32_u24 v3, s10, v4, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s16
-; GFX9-NEXT:    v_mad_u32_u24 v3, s9, v4, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-NEXT:    v_mad_u32_u24 v3, s8, v4, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s14
-; GFX9-NEXT:    v_mad_u32_u24 v3, s7, v4, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s13
-; GFX9-NEXT:    v_mad_u32_u24 v3, s6, v4, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s12
-; GFX9-NEXT:    v_mad_u32_u24 v3, s1, v4, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-NEXT:    v_mad_u32_u24 v3, s0, v4, v3
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
+; GFX9-NEXT:    v_mad_u32_u24 v1, s12, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s16
+; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s15
+; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s13
+; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT:    v_add_u32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1605,46 +1605,46 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_lshr_b32 s0, s2, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s17, s4, 0x40004
-; GFX9-DL-NEXT:    s_lshr_b32 s11, s4, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s4, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s4, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s15, s4, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s16, s4, 0x40008
-; GFX9-DL-NEXT:    s_and_b32 s4, s4, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s2, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40004
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX9-DL-NEXT:    s_lshr_b32 s7, s6, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s6, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s17, s6, 0x40008
+; GFX9-DL-NEXT:    s_and_b32 s6, s6, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s2, 0x40004
 ; GFX9-DL-NEXT:    s_and_b32 s2, s2, 15
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-DL-NEXT:    v_mad_u32_u24 v3, s2, v2, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s17
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v2, v3
-; GFX9-DL-NEXT:    v_mad_u32_u24 v3, s10, v4, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
-; GFX9-DL-NEXT:    v_mad_u32_u24 v3, s9, v4, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s15
-; GFX9-DL-NEXT:    v_mad_u32_u24 v3, s8, v4, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s14
-; GFX9-DL-NEXT:    v_mad_u32_u24 v3, s7, v4, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s13
-; GFX9-DL-NEXT:    v_mad_u32_u24 v3, s6, v4, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s12
-; GFX9-DL-NEXT:    v_mad_u32_u24 v3, s1, v4, v3
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-DL-NEXT:    v_mad_u32_u24 v3, s0, v4, v3
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v0, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s12, v2, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s16
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s15
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s13
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, v0, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1657,36 +1657,36 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX10-DL-NEXT:    s_and_b32 s1, s4, 15
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_and_b32 s6, s2, 15
+; GFX10-DL-NEXT:    s_and_b32 s7, s4, 15
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40008
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s9, s2, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s11, s2, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s12, s4, 0x40010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s5, s6, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x40008
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s7, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s5, s8, v0
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x4000c
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s7, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s9, s10, v1
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x40010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s5, s8, v1
 ; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s13, s2, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s14, s4, 0x40018
-; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s7, s8, v3
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s9, s10, v1
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s2, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x40018
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
 ; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s9, s10, v3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s11, s12, v3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s5, s6, v3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s13, s14, v3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s2, s4, v3
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s5, s8, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s9, s10, v1
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s4, v1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v0, v1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                 <8 x i4> addrspace(1)* %src2,
@@ -1768,42 +1768,42 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
-; GFX7-NEXT:    s_load_dword s1, s[10:11], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s10, s[10:11], 0x0
+; GFX7-NEXT:    s_load_dword s21, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s8, s0, 28
-; GFX7-NEXT:    s_lshr_b32 s15, s1, 28
-; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s20, s1, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s21, s1, 0x40004
-; GFX7-NEXT:    s_and_b32 s1, s1, 15
-; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x40010
+; GFX7-NEXT:    s_lshr_b32 s1, s0, 28
+; GFX7-NEXT:    s_lshr_b32 s11, s10, 28
+; GFX7-NEXT:    s_bfe_u32 s15, s10, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s16, s10, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s17, s10, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s18, s10, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s19, s10, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s20, s10, 0x40004
+; GFX7-NEXT:    s_and_b32 s10, s10, 15
+; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
 ; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x4000c
 ; GFX7-NEXT:    s_bfe_u32 s13, s0, 0x40008
 ; GFX7-NEXT:    s_bfe_u32 s14, s0, 0x40004
 ; GFX7-NEXT:    s_and_b32 s0, s0, 15
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s21
-; GFX7-NEXT:    v_mad_u32_u24 v0, s14, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s20
-; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s14, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s19
-; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s18
-; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s17
-; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s15
+; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
+; GFX7-NEXT:    v_mad_u32_u24 v0, s1, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -1813,44 +1813,44 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 28
-; GFX8-NEXT:    s_lshr_b32 s11, s4, 28
-; GFX8-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s13, s4, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s14, s4, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s15, s4, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s16, s4, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s17, s4, 0x40004
-; GFX8-NEXT:    s_and_b32 s4, s4, 15
-; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
+; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX8-NEXT:    s_and_b32 s6, s6, 15
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x40004
 ; GFX8-NEXT:    s_and_b32 s2, s2, 15
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v2, v3
-; GFX8-NEXT:    v_mov_b32_e32 v3, s17
-; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s16
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s15
-; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s14
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s13
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s12
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s11
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s19
+; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s18
+; GFX8-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s16
+; GFX8-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s15
+; GFX8-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s14
+; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s13
+; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1860,44 +1860,44 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s19, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s0, s2, 28
-; GFX9-NEXT:    s_lshr_b32 s11, s4, 28
-; GFX9-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s13, s4, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s14, s4, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s15, s4, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s16, s4, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s17, s4, 0x40004
-; GFX9-NEXT:    s_and_b32 s4, s4, 15
-; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
+; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
+; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s18, s6, 0x40004
+; GFX9-NEXT:    s_and_b32 s6, s6, 15
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s12, s2, 0x40004
 ; GFX9-NEXT:    s_and_b32 s2, s2, 15
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v2, v3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s17
-; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s16
-; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s14
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s13
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s12
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s18
+; GFX9-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s16
+; GFX9-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s15
+; GFX9-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s14
+; GFX9-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1906,15 +1906,15 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s2, v2, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s4, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1924,14 +1924,14 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s4, s5, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s2, s4, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
@@ -2032,46 +2032,46 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s0, s2, 15
-; GFX8-NEXT:    s_and_b32 s1, s4, 15
+; GFX8-NEXT:    s_and_b32 s1, s2, 15
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v4, s5
-; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s8, s4, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s10, s4, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s12, s4, 0x40018
-; GFX8-NEXT:    s_lshr_b32 s14, s4, 28
-; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    s_and_b32 s1, s0, 15
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40008
 ; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v6, s4
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v6, s7
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40010
 ; GFX8-NEXT:    v_mov_b32_e32 v7, s8
-; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s10
-; GFX8-NEXT:    s_bfe_u32 s13, s2, 0x40018
-; GFX8-NEXT:    v_mov_b32_e32 v9, s12
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v8, s9
+; GFX8-NEXT:    s_bfe_u32 s12, s0, 0x40018
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s11, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s13, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s14
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2080,53 +2080,53 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-NEXT:    s_and_b32 s1, s4, 15
-; GFX9-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40004
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX9-NEXT:    s_and_b32 s1, s2, 15
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s13, s2, 28
+; GFX9-NEXT:    s_and_b32 s4, s0, 15
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_bfe_u32 s5, s4, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s7, s4, 0x4000c
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s7
-; GFX9-NEXT:    v_pk_mul_lo_u16 v3, s0, v3
-; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x4000c
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s6
-; GFX9-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-NEXT:    s_bfe_u32 s0, s4, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s7, s4, 0x40014
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
-; GFX9-NEXT:    v_pk_mul_lo_u16 v4, s1, v4
-; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s1, s4, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s13
+; GFX9-NEXT:    s_bfe_u32 s1, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s12, s0, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40014
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s12, s0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s11
+; GFX9-NEXT:    v_pk_mul_lo_u16 v4, s0, v4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s8, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s0
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX9-NEXT:    s_bfe_u32 s0, s2, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
-; GFX9-NEXT:    v_pk_mul_lo_u16 v5, s5, v5
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v6, s1
-; GFX9-NEXT:    v_pk_mul_lo_u16 v6, s0, v6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s6, s7
+; GFX9-NEXT:    v_pk_mul_lo_u16 v3, s4, v3
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v6, s0
+; GFX9-NEXT:    v_pk_mul_lo_u16 v6, s1, v6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s9, s10
+; GFX9-NEXT:    v_pk_mul_lo_u16 v5, s2, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
 ; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2135,53 +2135,53 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-DL-NEXT:    s_and_b32 s1, s4, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40004
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
+; GFX9-DL-NEXT:    s_and_b32 s1, s2, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40018
+; GFX9-DL-NEXT:    s_lshr_b32 s13, s2, 28
+; GFX9-DL-NEXT:    s_and_b32 s4, s0, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s4, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x4000c
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s5, s5, s7
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, s0, v3
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x4000c
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s1, s1, s6
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-DL-NEXT:    s_bfe_u32 s0, s4, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x40014
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s7
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, s1, v4
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s4, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s11, s11, s13
+; GFX9-DL-NEXT:    s_bfe_u32 s1, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s0, 0x40018
+; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40014
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s12, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s11
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, s0, v4
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s8, s2
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s0
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX9-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, s5, v5
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s1
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v6, s0, v6
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s6, s7
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, s4, v3
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s0
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v6, s1, v6
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s9, s10
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, s2, v5
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v5
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v6
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2191,42 +2191,42 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX10-DL-NEXT:    s_and_b32 s5, s4, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40008
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x4000c
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s0, s5
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s1, s6
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s2, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s4, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40014
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, s7, s0
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s8, s1
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40018
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s0, 0x4000c
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40008
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40008
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s2, s4
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s4, s6, s7
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s5, s8
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40014
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, s5, s4
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s7, s8
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s4, s0
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s6, s1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s1, s5
-; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s6, s4
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s2, s5
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, s0, s1
@@ -2553,59 +2553,59 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s4, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s5, 0x40004
-; GFX10-DL-NEXT:    s_and_b32 s6, s4, 15
-; GFX10-DL-NEXT:    s_and_b32 s8, s5, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s4, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s9, s5, 0x4000c
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, s0, s1
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s4, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s6, s0, 15
+; GFX10-DL-NEXT:    s_and_b32 s8, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s0, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s1, 0x4000c
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, s4, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40008
 ; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s6, s8
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s5, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40008
 ; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s7, s9
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v3
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40014
-; GFX10-DL-NEXT:    s_lshr_b32 s7, s4, 28
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s0, s1
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40018
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s4, s5
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v5
 ; GFX10-DL-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s5, 0x40014
-; GFX10-DL-NEXT:    s_lshr_b32 s9, s5, 28
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40010
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s6, s0
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s5, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s4, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s5, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40010
+; GFX10-DL-NEXT:    s_lshr_b32 s9, s1, 28
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s5, s7
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s1, 0x40018
 ; GFX10-DL-NEXT:    v_or_b32_e32 v4, v3, v4
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v5
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s1, s8
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, s7, s9
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v8, s0, s4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
-; GFX10-DL-NEXT:    v_or_b32_e32 v5, v6, v5
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 8, v7
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, s2, v5
-; GFX10-DL-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT:    v_or_b32_e32 v11, v5, v6
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v11
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s4, s8
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, s0, s9
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 8, v7
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v9
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v5
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s6, s1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v8
+; GFX10-DL-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT:    v_or_b32_e32 v4, v3, v5
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 8, v4
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v5
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v7
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
@@ -2696,48 +2696,48 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s0, s2, 15
-; GFX8-NEXT:    s_and_b32 s1, s4, 15
+; GFX8-NEXT:    s_and_b32 s1, s2, 15
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s6, s4, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s7, s4, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v4, s5
-; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40008
+; GFX8-NEXT:    s_and_b32 s1, s0, 15
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40004
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x4000c
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40008
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX8-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40010
 ; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX8-NEXT:    s_bfe_u32 s11, s4, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v7, s9
-; GFX8-NEXT:    s_bfe_u32 s13, s4, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s11
-; GFX8-NEXT:    s_bfe_u32 s14, s2, 0x40018
-; GFX8-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX8-NEXT:    v_mov_b32_e32 v9, s13
-; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v7, s6
+; GFX8-NEXT:    s_lshr_b32 s11, s2, 28
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v8, s8
+; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x40018
+; GFX8-NEXT:    v_mov_b32_e32 v9, s2
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s12, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s14, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -2747,48 +2747,48 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-NEXT:    s_and_b32 s1, s4, 15
+; GFX9-NEXT:    s_and_b32 s1, s2, 15
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s6, s4, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s7, s4, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40008
+; GFX9-NEXT:    s_and_b32 s1, s0, 15
+; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40004
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s7
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x4000c
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40008
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40010
 ; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX9-NEXT:    s_bfe_u32 s11, s4, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v7, s9
-; GFX9-NEXT:    s_bfe_u32 s13, s4, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s12, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-NEXT:    s_bfe_u32 s14, s2, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX9-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v7, s6
+; GFX9-NEXT:    s_lshr_b32 s11, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v8, s8
+; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40018
+; GFX9-NEXT:    v_mov_b32_e32 v9, s2
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s12, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s14, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -2798,48 +2798,48 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-DL-NEXT:    s_and_b32 s1, s4, 15
+; GFX9-DL-NEXT:    s_and_b32 s1, s2, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s4, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s4, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s4, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s2, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40008
+; GFX9-DL-NEXT:    s_and_b32 s1, s0, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s7
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x4000c
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40008
 ; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s4, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40010
 ; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s4, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s9
-; GFX9-DL-NEXT:    s_bfe_u32 s13, s4, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s2, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT:    s_bfe_u32 s14, s2, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s6
+; GFX9-DL-NEXT:    s_lshr_b32 s11, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s8
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40018
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s2
+; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s12, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s14, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
@@ -2850,40 +2850,40 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
-; GFX10-DL-NEXT:    s_and_b32 s1, s4, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
-; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s9, s4, 0x40008
+; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
+; GFX10-DL-NEXT:    s_and_b32 s4, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40008
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s4, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x4000c
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s8, s0
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40010
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40014
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s9, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s1, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s7, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s4, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40018
-; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
 ; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
@@ -2927,41 +2927,41 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s4, 15
-; GFX7-NEXT:    s_and_b32 s8, s5, 15
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40004
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s13, s4, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s15, s4, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s17, s4, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s19, s4, 0x40018
+; GFX7-NEXT:    s_and_b32 s6, s4, 15
+; GFX7-NEXT:    s_and_b32 s7, s5, 15
+; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40004
+; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s14, s4, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s16, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s18, s4, 0x40018
 ; GFX7-NEXT:    s_lshr_b32 s4, s4, 28
-; GFX7-NEXT:    v_mov_b32_e32 v0, s7
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v0, v1
-; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x40004
-; GFX7-NEXT:    s_bfe_u32 s12, s5, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s20, s5, 0x40018
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s20
+; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
+; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x40004
+; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s13, s5, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40018
 ; GFX7-NEXT:    s_lshr_b32 s5, s5, 28
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s9
-; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s11
-; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s13
-; GFX7-NEXT:    v_mad_u32_u24 v0, s14, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s15
-; GFX7-NEXT:    v_mad_u32_u24 v0, s16, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s17
-; GFX7-NEXT:    v_mad_u32_u24 v0, s18, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s19
-; GFX7-NEXT:    v_mad_u32_u24 v0, s20, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s10
+; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s12
+; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s14
+; GFX7-NEXT:    v_mad_u32_u24 v0, s15, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s16
+; GFX7-NEXT:    v_mad_u32_u24 v0, s17, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    v_mad_u32_u24 v0, s19, v1, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -2972,43 +2972,43 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s0, s2, 15
-; GFX8-NEXT:    s_and_b32 s1, s3, 15
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s13, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s15, s2, 0x40018
+; GFX8-NEXT:    s_and_b32 s4, s2, 15
+; GFX8-NEXT:    s_and_b32 s5, s3, 15
+; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s14, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s16, s2, 0x40018
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v2, v3
-; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s8, s3, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s10, s3, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s12, s3, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s14, s3, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s16, s3, 0x40018
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s18
+; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
+; GFX8-NEXT:    s_bfe_u32 s7, s3, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s9, s3, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s11, s3, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s13, s3, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s15, s3, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s17, s3, 0x40018
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 28
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s9
-; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s11
-; GFX8-NEXT:    v_mad_u32_u24 v2, s12, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s13
-; GFX8-NEXT:    v_mad_u32_u24 v2, s14, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s15
-; GFX8-NEXT:    v_mad_u32_u24 v2, s16, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX8-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s10
+; GFX8-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s12
+; GFX8-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s14
+; GFX8-NEXT:    v_mad_u32_u24 v0, s15, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s16
+; GFX8-NEXT:    v_mad_u32_u24 v2, s17, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -3019,43 +3019,43 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s2, 15
-; GFX9-NEXT:    s_and_b32 s1, s3, 15
-; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s13, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s15, s2, 0x40018
+; GFX9-NEXT:    s_and_b32 s4, s2, 15
+; GFX9-NEXT:    s_and_b32 s5, s3, 15
+; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s12, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s14, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s16, s2, 0x40018
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v2, v3
-; GFX9-NEXT:    s_bfe_u32 s6, s3, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s8, s3, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s10, s3, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40018
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s18
+; GFX9-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
+; GFX9-NEXT:    s_bfe_u32 s7, s3, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s9, s3, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s11, s3, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s17, s3, 0x40018
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, 28
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    v_mad_u32_u24 v2, s12, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s13
-; GFX9-NEXT:    v_mad_u32_u24 v2, s14, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    v_mad_u32_u24 v2, s16, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
+; GFX9-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s12
+; GFX9-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s14
+; GFX9-NEXT:    v_mad_u32_u24 v0, s15, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s16
+; GFX9-NEXT:    v_mad_u32_u24 v2, s17, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -3065,14 +3065,14 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -3082,14 +3082,14 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s4, s3, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s3, s2, v2
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                           i32 addrspace(1)* %v2addr,

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 12573a5fee3b..4f32340a0ff8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -562,31 +562,32 @@ define amdgpu_kernel void @maxnum_v4f16(
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
 ; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    s_lshr_b32 s5, s5, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
-; SI-NEXT:    s_lshr_b32 s4, s5, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT:    s_lshr_b32 s4, s7, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
-; SI-NEXT:    s_lshr_b32 s4, s6, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s6, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, s6
 ; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; SI-NEXT:    v_max_f32_e32 v3, v3, v5
 ; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_max_f32_e32 v1, v1, v5
 ; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_max_f32_e32 v2, v2, v5
 ; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index cdf05094f692..923383cf5a1a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -615,31 +615,32 @@ define amdgpu_kernel void @minnum_v4f16(
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
 ; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    s_lshr_b32 s5, s5, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
-; SI-NEXT:    s_lshr_b32 s4, s5, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT:    s_lshr_b32 s4, s7, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
-; SI-NEXT:    s_lshr_b32 s4, s6, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s6, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, s6
 ; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; SI-NEXT:    v_min_f32_e32 v3, v3, v5
 ; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_min_f32_e32 v1, v1, v5
 ; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_min_f32_e32 v2, v2, v5
 ; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 8e4b6806f98a..fe710f7c79bb 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -10,8 +10,8 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 ; GCN-LABEL: {{^}}madak_f32:
 ; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
 ; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
 ; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
 ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
 ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
 ; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
@@ -101,8 +101,8 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %o
 ; GCN-LABEL: {{^}}madak_inline_imm_f32:
 ; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
 ; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
 ; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
 ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
 ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
 ; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]

diff  --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index 371831f28231..5839eccf6086 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -16,13 +16,13 @@ define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrs
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_ushort v5, v[0:1]
 ; VI-NEXT:    flat_load_ushort v2, v[2:3]
-; VI-NEXT:    flat_load_ushort v3, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_i16_e32 v2, v3, v2
+; VI-NEXT:    v_max_i16_e32 v2, v5, v2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -38,13 +38,13 @@ define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    global_load_ushort v5, v[0:1], off
 ; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX9-NEXT:    global_load_ushort v3, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_i16_e32 v2, v3, v2
+; GFX9-NEXT:    v_max_i16_e32 v2, v5, v2
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -73,15 +73,15 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_dword v5, v[0:1]
 ; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_i16_e32 v4, v3, v2
-; VI-NEXT:    v_max_i16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v2, v4, v2
+; VI-NEXT:    v_max_i16_e32 v3, v5, v2
+; VI-NEXT:    v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -97,13 +97,13 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v2, v[2:3], off
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_max_i16 v2, v3, v2
+; GFX9-NEXT:    v_pk_max_i16 v2, v5, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -124,35 +124,35 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v8
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s6, v8
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v6
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v6, vcc, 4, v2
-; VI-NEXT:    flat_load_dword v9, v[0:1]
-; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_ushort v4, v[4:5]
-; VI-NEXT:    flat_load_dword v5, v[2:3]
-; VI-NEXT:    flat_load_ushort v6, v[6:7]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v8
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v7, s5
+; VI-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
+; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; VI-NEXT:    flat_load_dword v8, v[0:1]
+; VI-NEXT:    flat_load_ushort v9, v[4:5]
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_dword v2, v[2:3]
+; VI-NEXT:    flat_load_ushort v0, v[0:1]
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 4, v6
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT:    v_max_i16_e32 v7, v5, v9
-; VI-NEXT:    v_max_i16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_i16_e32 v1, v8, v2
+; VI-NEXT:    v_max_i16_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_i16_e32 v4, v6, v4
-; VI-NEXT:    v_or_b32_e32 v5, v7, v5
-; VI-NEXT:    flat_store_dword v[0:1], v5
-; VI-NEXT:    flat_store_short v[2:3], v4
+; VI-NEXT:    v_max_i16_e32 v0, v9, v0
+; VI-NEXT:    flat_store_dword v[6:7], v1
+; VI-NEXT:    flat_store_short v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_imax_sge_v3i16:
@@ -167,19 +167,20 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v6, v[2:3], off
-; GFX9-NEXT:    global_load_dword v7, v[0:1], off
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v8, v7
-; GFX9-NEXT:    v_pk_max_i16 v7, v7, v6
-; GFX9-NEXT:    global_load_short_d16 v6, v[2:3], off offset:4
+; GFX9-NEXT:    v_pk_max_i16 v6, v6, v7
+; GFX9-NEXT:    global_load_short_d16 v7, v[2:3], off offset:4
 ; GFX9-NEXT:    global_load_short_d16 v8, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_max_i16 v0, v8, v6
-; GFX9-NEXT:    global_store_dword v[4:5], v7, off
+; GFX9-NEXT:    v_pk_max_i16 v0, v8, v7
+; GFX9-NEXT:    global_store_dword v[4:5], v6, off
 ; GFX9-NEXT:    global_store_short v[4:5], v0, off offset:4
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -208,8 +209,8 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
@@ -235,8 +236,8 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
@@ -271,13 +272,13 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrs
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_ushort v5, v[0:1]
 ; VI-NEXT:    flat_load_ushort v2, v[2:3]
-; VI-NEXT:    flat_load_ushort v3, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_i16_e32 v2, v3, v2
+; VI-NEXT:    v_max_i16_e32 v2, v5, v2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -293,13 +294,13 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    global_load_ushort v5, v[0:1], off
 ; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX9-NEXT:    global_load_ushort v3, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_i16_e32 v2, v3, v2
+; GFX9-NEXT:    v_max_i16_e32 v2, v5, v2
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -328,13 +329,13 @@ define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrs
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_ushort v5, v[0:1]
 ; VI-NEXT:    flat_load_ushort v2, v[2:3]
-; VI-NEXT:    flat_load_ushort v3, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_u16_e32 v2, v3, v2
+; VI-NEXT:    v_max_u16_e32 v2, v5, v2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -350,13 +351,13 @@ define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    global_load_ushort v5, v[0:1], off
 ; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX9-NEXT:    global_load_ushort v3, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_u16_e32 v2, v3, v2
+; GFX9-NEXT:    v_max_u16_e32 v2, v5, v2
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -385,13 +386,13 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrs
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_ushort v5, v[0:1]
 ; VI-NEXT:    flat_load_ushort v2, v[2:3]
-; VI-NEXT:    flat_load_ushort v3, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_u16_e32 v2, v3, v2
+; VI-NEXT:    v_max_u16_e32 v2, v5, v2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -407,13 +408,13 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    global_load_ushort v5, v[0:1], off
 ; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX9-NEXT:    global_load_ushort v3, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_u16_e32 v2, v3, v2
+; GFX9-NEXT:    v_max_u16_e32 v2, v5, v2
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -441,15 +442,15 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_dword v5, v[0:1]
 ; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_u16_e32 v4, v3, v2
-; VI-NEXT:    v_max_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v2, v4, v2
+; VI-NEXT:    v_max_u16_e32 v3, v5, v2
+; VI-NEXT:    v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -465,13 +466,13 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v2, v[2:3], off
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_max_u16 v2, v3, v2
+; GFX9-NEXT:    v_pk_max_u16 v2, v5, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 811fba02c609..c8c118280e9b 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -393,11 +393,11 @@ define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ;
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 84693f61aa61..494613f45f7d 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -73,7 +73,7 @@ entry:
 ; GCN-LABEL: {{^}}mul_v2i16:
 ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
 ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
+; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
 ; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa

diff  --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
index 6c8891d28d6d..ed57ec6cca35 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -78,7 +78,7 @@ define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i
 ; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
-; GCN: s_mov_b32 m0, -1
+; GCN-DAG: s_mov_b32 m0, -1
 
 ; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
 ; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256

diff  --git a/llvm/test/CodeGen/AMDGPU/sub.i16.ll b/llvm/test/CodeGen/AMDGPU/sub.i16.ll
index 6ff620c87e64..aee5c489d351 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.i16.ll
@@ -107,7 +107,7 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i1
 ; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
 define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@@ -127,7 +127,7 @@ define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i1
 ; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}

diff  --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index c7a08a665115..5f540df4968d 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -66,39 +66,39 @@ define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; GFX9-LABEL: s_test_sub_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    v_pk_sub_i16 v0, s6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_pk_sub_i16 v0, s4, v0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_sub_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s6, s[6:7], 0x0
-; VI-NEXT:    s_load_dword s7, s[0:1], 0x0
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s6, 16
-; VI-NEXT:    s_lshr_b32 s5, s7, 16
+; VI-NEXT:    s_lshr_b32 s6, s4, 16
+; VI-NEXT:    s_lshr_b32 s7, s5, 16
 ; VI-NEXT:    s_sub_i32 s4, s4, s5
-; VI-NEXT:    s_sub_i32 s6, s6, s7
-; VI-NEXT:    s_and_b32 s5, s6, 0xffff
-; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_or_b32 s4, s5, s4
+; VI-NEXT:    s_sub_i32 s5, s6, s7
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_or_b32 s4, s4, s5
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -614,12 +614,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v2, v[2:3], off
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dword v1, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v1, v0, v2
+; GFX9-NEXT:    v_pk_sub_i16 v1, v0, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 16
 ; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 16
@@ -642,15 +642,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_dword v2, v[2:3]
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[2:3]
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_sub_u16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_sub_u16_e32 v0, v0, v2
-; VI-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
 ; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index ff3e837235f0..200026494139 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -106,13 +106,13 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(<2 x i16> addrspace(
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[4:5], 0x0
-; VI-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
+; VI-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_sext_i32_i16 s0, s2
-; VI-NEXT:    s_sext_i32_i16 s1, s3
+; VI-NEXT:    s_sext_i32_i16 s0, s0
+; VI-NEXT:    s_sext_i32_i16 s1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    v_mul_i32_i24_e32 v2, s1, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 145a18177a1e..3e8384ad30d5 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -19,11 +19,11 @@ define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_234u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -114,15 +114,15 @@ define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_35u5:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v[2:3], off
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v1, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -134,15 +134,15 @@ define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_357u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -181,10 +181,10 @@ define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_0145:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -196,10 +196,11 @@ define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_0167:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -239,11 +240,11 @@ define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_2345:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -255,11 +256,10 @@ define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_2367:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -271,11 +271,10 @@ define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_4501:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[3:4], v[2:3], off
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -287,8 +286,8 @@ define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_4523:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -329,11 +328,11 @@ define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_6701:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -345,8 +344,8 @@ define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_6723:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -390,9 +389,9 @@ define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v5
@@ -409,9 +408,9 @@ define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_sdwa v0, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
@@ -424,15 +423,15 @@ define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_3456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
-; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -444,15 +443,15 @@ define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_5634:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
-; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v3
+; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -464,16 +463,16 @@ define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_5734:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v3
-; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v2
+; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -487,9 +486,9 @@ define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> ad
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v5
@@ -504,10 +503,11 @@ define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> ad
 ; GFX9-LABEL: shuffle_v4i16_0167:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
   %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
@@ -571,12 +571,12 @@ define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_6161:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -651,10 +651,8 @@ define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half>
 ; GFX9-LABEL: shuffle_v8f16_4589:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v[2:3], off
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:8
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-NEXT:    global_load_dword v1, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
@@ -667,10 +665,8 @@ define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x
 ; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
 ; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
@@ -685,9 +681,9 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -732,10 +728,13 @@ define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x hal
 ; GFX9-LABEL: shuffle_v6f16_452367:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v3, v[3:4], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
@@ -760,9 +759,9 @@ define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readon
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v6, v0, v2, v6 op_sel_hi:[0,1,1]
 ; GFX9-NEXT:    v_pk_fma_f16 v2, v1, v2, v7 op_sel_hi:[0,1,1]
@@ -803,14 +802,15 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-LABEL: shuffle_v4f16_0456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
-; GFX9-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v4
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX9-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1

diff  --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll
index 1f0a66f91656..c06c16e29fe9 100644
--- a/llvm/test/CodeGen/AMDGPU/wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/wait.ll
@@ -13,7 +13,7 @@
 ; DEFAULT: buffer_load_format_xyzw
 ; DEFAULT: s_waitcnt vmcnt(0)
 ; DEFAULT: exp
-; DEFAULT-NEXT: exp
+; DEFAULT: exp
 ; DEFAULT-NEXT: s_endpgm
 define amdgpu_vs void @main(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, <16 x i8> addrspace(4)* inreg %arg3, <16 x i8> addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(4)* inreg %constptr) #0 {
 main_body:


        


More information about the llvm-commits mailing list