[llvm] 8a52bd8 - [AMDGPU] Only select VOP3 forms of VOP2 instructions

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 24 03:17:22 PST 2021


Author: Jay Foad
Date: 2021-11-24T11:15:30Z
New Revision: 8a52bd82e36855b3ad842f2535d0c78a97db55dc

URL: https://github.com/llvm/llvm-project/commit/8a52bd82e36855b3ad842f2535d0c78a97db55dc
DIFF: https://github.com/llvm/llvm-project/commit/8a52bd82e36855b3ad842f2535d0c78a97db55dc.diff

LOG: [AMDGPU] Only select VOP3 forms of VOP2 instructions

Change VOP_PAT_GEN to default to not generating an instruction selection
pattern for the VOP2 (e32) form of an instruction, only for the VOP3
(e64) form. This allows SIFoldOperands maximum freedom to fold copies
into the operands of an instruction, before SIShrinkInstructions tries
to shrink it back to the smaller encoding.

This affects the following VOP2 instructions:
v_min_i32
v_max_i32
v_min_u32
v_max_u32
v_and_b32
v_or_b32
v_xor_b32
v_lshr_b32
v_ashr_i32
v_lshl_b32

A further cleanup could simplify or remove VOP_PAT_GEN, since its
optional second argument is never used.

Differential Revision: https://reviews.llvm.org/D114252

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIInstrInfo.td
    llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
    llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
    llvm/test/CodeGen/AMDGPU/commute-shifts.ll
    llvm/test/CodeGen/AMDGPU/ctpop16.ll
    llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/inline-asm.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
    llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
    llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
    llvm/test/CodeGen/AMDGPU/select-constant-xor.ll
    llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
    llvm/test/CodeGen/AMDGPU/shl.ll
    llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
    llvm/test/CodeGen/AMDGPU/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/stack-realign.ll
    llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8c24268e379e..47ee83eb9351 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2246,7 +2246,7 @@ class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
   let HasExtSDWA9 = 0;
 }
 
-class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.Pattern> : VOPProfile <p.ArgVT> {
+class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p.ArgVT> {
   let NeedPatGen = mode;
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
index ae63e24afbf4..baff889c521e 100644
--- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -40,8 +40,8 @@ define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x
 ; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]]
 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
 ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index ce6340fb3953..80580967a788 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}v_ubfe_sub_i32:
 ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
@@ -24,11 +24,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(
 ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
 ; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
 
-; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
-; SI-NEXT: v_lshr_b32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
-
-; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
-; VI-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; GCN-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
 
 ; GCN: [[BFE]]
 ; GCN: [[SHL]]
@@ -101,11 +98,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(
 ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
 ; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
 
-; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
-; SI-NEXT: v_ashr_i32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
-
-; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
-; VI-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; GCN-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
 
 ; GCN: [[BFE]]
 ; GCN: [[SHL]]

diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
index 13262ce6c631..59244ea56c28 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -17,7 +17,7 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
 ; SI-NEXT:    image_load v2, v0, s[0:7] dmask:0x1 unorm
 ; SI-NEXT:    v_and_b32_e32 v0, 7, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshr_b32_e32 v0, v2, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v0, v2
 ; SI-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc

diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index aedd9d573d4f..59a4e0b7368a 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -730,7 +730,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v14, s0, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v3
+; SI-NEXT:    v_and_b32_e32 v15, s0, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v7, v7, 0
 ; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
@@ -773,7 +773,6 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
-; VI-NEXT:    v_mov_b32_e32 v8, 0xffff
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -786,23 +785,23 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; VI-NEXT:    s_mov_b32 s0, 0xffff
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; VI-NEXT:    v_and_b32_e32 v3, v8, v3
-; VI-NEXT:    v_and_b32_e32 v2, v8, v2
-; VI-NEXT:    v_and_b32_e32 v1, v8, v1
-; VI-NEXT:    v_and_b32_e32 v0, v8, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
+; VI-NEXT:    v_and_b32_e32 v3, s0, v3
+; VI-NEXT:    v_and_b32_e32 v2, s0, v2
+; VI-NEXT:    v_and_b32_e32 v1, s0, v1
+; VI-NEXT:    v_and_b32_e32 v0, s0, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
 ; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
 ; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
 ; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
+; VI-NEXT:    v_bcnt_u32_b32 v8, v8, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v9, v9, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v10, v10, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v11, v11, 0
-; VI-NEXT:    v_bcnt_u32_b32 v12, v12, 0
 ; VI-NEXT:    v_and_b32_e32 v7, s0, v7
 ; VI-NEXT:    v_and_b32_e32 v6, s0, v6
 ; VI-NEXT:    v_and_b32_e32 v5, s0, v5
@@ -811,27 +810,27 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
-; VI-NEXT:    v_bcnt_u32_b32 v8, v8, 0
+; VI-NEXT:    v_bcnt_u32_b32 v12, v12, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v13, v13, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v14, v14, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v15, v15, 0
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; VI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; VI-NEXT:    v_bcnt_u32_b32 v7, v7, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v5, v5, 0
 ; VI-NEXT:    v_bcnt_u32_b32 v4, v4, 0
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; VI-NEXT:    v_or_b32_e32 v3, v3, v9
-; VI-NEXT:    v_or_b32_e32 v2, v2, v10
-; VI-NEXT:    v_or_b32_e32 v1, v1, v11
-; VI-NEXT:    v_or_b32_e32 v0, v0, v12
-; VI-NEXT:    v_or_b32_e32 v7, v7, v8
+; VI-NEXT:    v_or_b32_e32 v3, v3, v8
+; VI-NEXT:    v_or_b32_e32 v2, v2, v9
+; VI-NEXT:    v_or_b32_e32 v1, v1, v10
+; VI-NEXT:    v_or_b32_e32 v0, v0, v11
+; VI-NEXT:    v_or_b32_e32 v7, v7, v12
 ; VI-NEXT:    v_or_b32_e32 v6, v6, v13
 ; VI-NEXT:    v_or_b32_e32 v5, v5, v14
 ; VI-NEXT:    v_or_b32_e32 v4, v4, v15

diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 5dee220a5db7..930216a95135 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -166,8 +166,8 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT:    v_lshl_b32_e32 v0, v0, v1
-; SI-NEXT:    v_lshr_b32_e32 v0, v0, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bzhi32_d1_indexzext:

diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 238c9d043a4a..09e2e90feed9 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -500,9 +500,9 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s32
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-NEXT:    scratch_store_dword v2, v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
@@ -514,14 +514,14 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 15
-; GFX10-NEXT:    v_mov_b32_e32 v2, s32
-; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
-; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    v_mov_b32_e32 v1, s32
+; GFX10-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
+; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
+; GFX10-NEXT:    scratch_store_dword v0, v3, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v2, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -529,9 +529,9 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9-PAL:       ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
@@ -543,14 +543,14 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX10-PAL:       ; %bb.0: ; %bb
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s32
-; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s32
+; GFX10-PAL-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v3, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off glc dlc
+; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -1247,9 +1247,9 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
 ; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-NEXT:    scratch_store_dword v2, v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
@@ -1261,17 +1261,17 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
-; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
-; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
-; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
+; GFX10-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
+; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
+; GFX10-NEXT:    scratch_load_dword v2, off, s32 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    scratch_store_dword v0, v3, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v2, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1282,9 +1282,9 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x100
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
@@ -1296,17 +1296,17 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-PAL:       ; %bb.0: ; %bb
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, vcc_lo
-; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
-; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
+; GFX10-PAL-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, vcc_lo
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
+; GFX10-PAL-NEXT:    scratch_load_dword v2, off, s32 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v3, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off glc dlc
+; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -2019,9 +2019,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
 ; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-NEXT:    scratch_store_dword v2, v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
@@ -2033,17 +2033,17 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
-; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
-; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
-; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
+; GFX10-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
+; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
+; GFX10-NEXT:    scratch_load_dword v2, off, s32 offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    scratch_store_dword v0, v3, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v2, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2054,9 +2054,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
@@ -2068,17 +2068,17 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-PAL:       ; %bb.0: ; %bb
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, vcc_lo
-; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
-; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
+; GFX10-PAL-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, vcc_lo
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
+; GFX10-PAL-NEXT:    scratch_load_dword v2, off, s32 offset:4 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v3, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off glc dlc
+; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:

diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index df34354806cc..dd54b339d121 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -339,57 +339,56 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX7-NEXT:    buffer_load_ushort v16, off, s[0:3], 0
 ; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_bfe_i32 v1, v3, 0, 4
-; GFX7-NEXT:    v_bfe_i32 v4, v3, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v5, v3, 8, 4
-; GFX7-NEXT:    v_bfe_i32 v6, v3, 12, 4
-; GFX7-NEXT:    v_bfe_i32 v7, v3, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v8, v3, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v9, v3, 24, 4
-; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_i32 v10, v0, 0, 4
-; GFX7-NEXT:    v_bfe_i32 v11, v0, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v12, v0, 8, 4
-; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
-; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v15, v0, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v16, v0, 24, 4
-; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
-; GFX7-NEXT:    v_and_b32_e32 v9, v2, v9
-; GFX7-NEXT:    v_and_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v15, v2, v15
-; GFX7-NEXT:    v_and_b32_e32 v16, v2, v16
-; GFX7-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX7-NEXT:    buffer_load_ushort v2, off, s[0:3], 0
+; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
+; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
 ; GFX7-NEXT:    v_and_b32_e32 v10, s4, v10
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v9, v16
+; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
 ; GFX7-NEXT:    v_and_b32_e32 v11, s4, v11
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
+; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
 ; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
 ; GFX7-NEXT:    v_and_b32_e32 v12, s4, v12
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
+; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
 ; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
 ; GFX7-NEXT:    v_and_b32_e32 v13, s4, v13
+; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
 ; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
 ; GFX7-NEXT:    v_and_b32_e32 v14, s4, v14
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v10, v2
-; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
-; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
+; GFX7-NEXT:    v_and_b32_e32 v15, s4, v15
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
-; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v16, v1
-; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -913,57 +912,56 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
 ; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_bfe_i32 v1, v3, 0, 4
-; GFX7-NEXT:    v_bfe_i32 v4, v3, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v5, v3, 8, 4
-; GFX7-NEXT:    v_bfe_i32 v6, v3, 12, 4
-; GFX7-NEXT:    v_bfe_i32 v7, v3, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v8, v3, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v9, v3, 24, 4
-; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_i32 v10, v0, 0, 4
-; GFX7-NEXT:    v_bfe_i32 v11, v0, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v12, v0, 8, 4
-; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
-; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v15, v0, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v16, v0, 24, 4
-; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
-; GFX7-NEXT:    v_and_b32_e32 v9, v2, v9
-; GFX7-NEXT:    v_and_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v15, v2, v15
-; GFX7-NEXT:    v_and_b32_e32 v16, v2, v16
-; GFX7-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX7-NEXT:    buffer_load_ubyte v2, off, s[0:3], 0
+; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
+; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
 ; GFX7-NEXT:    v_and_b32_e32 v10, s4, v10
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v9, v16
+; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
 ; GFX7-NEXT:    v_and_b32_e32 v11, s4, v11
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
+; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
 ; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
 ; GFX7-NEXT:    v_and_b32_e32 v12, s4, v12
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
+; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
 ; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
 ; GFX7-NEXT:    v_and_b32_e32 v13, s4, v13
+; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
 ; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
 ; GFX7-NEXT:    v_and_b32_e32 v14, s4, v14
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v10, v2
-; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
-; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
+; GFX7-NEXT:    v_and_b32_e32 v15, s4, v15
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
-; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v16, v1
-; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
 ; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -2201,69 +2199,68 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    buffer_load_ushort v16, off, s[0:3], 0
+; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_bfe_i32 v1, v3, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v4, v3, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v5, v3, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v6, v3, 0, 4
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 16, 4
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 4, 4
+; GFX7-NEXT:    v_bfe_i32 v5, v2, 0, 4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_bfe_i32 v9, v0, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v10, v0, 16, 4
+; GFX7-NEXT:    v_bfe_i32 v11, v0, 4, 4
+; GFX7-NEXT:    v_bfe_i32 v12, v0, 0, 4
+; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-NEXT:    v_and_b32_e32 v5, s4, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX7-NEXT:    v_and_b32_e32 v10, s4, v12
+; GFX7-NEXT:    v_bfe_i32 v13, v0, 24, 4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v15, 28, v0
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT:    v_or_b32_e32 v5, v10, v9
+; GFX7-NEXT:    v_and_b32_e32 v11, s4, v13
+; GFX7-NEXT:    v_and_b32_e32 v13, s4, v15
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_bfe_i32 v7, v2, 8, 4
+; GFX7-NEXT:    v_bfe_i32 v14, v0, 8, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_i32 v10, v0, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v11, v0, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v12, v0, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v13, v0, 0, 4
-; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
-; GFX7-NEXT:    v_and_b32_e32 v11, v2, v13
-; GFX7-NEXT:    v_bfe_i32 v7, v3, 24, 4
-; GFX7-NEXT:    v_bfe_i32 v8, v3, 8, 4
-; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 28, v3
-; GFX7-NEXT:    v_bfe_i32 v3, v3, 12, 4
-; GFX7-NEXT:    v_bfe_i32 v14, v0, 24, 4
-; GFX7-NEXT:    v_bfe_i32 v15, v0, 8, 4
-; GFX7-NEXT:    v_ashrrev_i32_e32 v16, 28, v0
+; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v5, v16
+; GFX7-NEXT:    v_bfe_i32 v6, v2, 24, 4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v8, 28, v2
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 12, 4
+; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 12, 4
-; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
-; GFX7-NEXT:    v_or_b32_e32 v6, v11, v10
-; GFX7-NEXT:    v_and_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v9, v2, v9
-; GFX7-NEXT:    v_and_b32_e32 v12, v2, v14
-; GFX7-NEXT:    v_and_b32_e32 v13, v2, v15
-; GFX7-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX7-NEXT:    v_and_b32_e32 v14, v2, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v6, v2, v6
-; GFX7-NEXT:    v_and_b32_e32 v2, v2, v5
-; GFX7-NEXT:    buffer_load_ushort v5, off, s[0:3], 0
+; GFX7-NEXT:    v_and_b32_e32 v12, s4, v14
+; GFX7-NEXT:    v_mad_u32_u24 v3, v15, v10, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_mad_u32_u24 v3, v7, v12, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v3
+; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v4, v0
+; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_mad_u32_u24 v0, v14, v9, v0
 ; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
-; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v4, v4, v6, v5
-; GFX7-NEXT:    v_mad_u32_u24 v4, v16, v11, v4
-; GFX7-NEXT:    v_mad_u32_u24 v4, v8, v13, v4
-; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v4
-; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v2, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v10, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v12, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v14, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v11, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v13, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -2813,95 +2810,93 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX7-NEXT:    s_mov_b32 s5, 0xffff
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_bfe_i32 v6, v4, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v7, v4, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v8, v4, 12, 4
-; GFX7-NEXT:    v_bfe_i32 v9, v4, 8, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
-; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
+; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 28, v2
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 24, 4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_i32 v13, v0, 24, 4
-; GFX7-NEXT:    v_bfe_i32 v16, v0, 12, 4
-; GFX7-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX7-NEXT:    v_or_b32_e32 v7, v9, v8
-; GFX7-NEXT:    v_and_b32_e32 v9, v2, v13
+; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
+; GFX7-NEXT:    v_bfe_i32 v16, v0, 4, 4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v9
+; GFX7-NEXT:    v_and_b32_e32 v9, s4, v13
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v16
 ; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 28, v4
-; GFX7-NEXT:    v_bfe_i32 v5, v4, 24, 4
-; GFX7-NEXT:    v_bfe_i32 v10, v4, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v4, v4, 0, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v1
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 4
+; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
+; GFX7-NEXT:    v_bfe_i32 v7, v2, 8, 4
+; GFX7-NEXT:    v_bfe_i32 v8, v2, 4, 4
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; GFX7-NEXT:    v_and_b32_e32 v4, v2, v4
-; GFX7-NEXT:    v_ashrrev_i32_e32 v12, 28, v0
-; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v15, v0, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v17, v0, 8, 4
-; GFX7-NEXT:    v_bfe_i32 v18, v0, 4, 4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_ashrrev_i32_e32 v10, 28, v0
+; GFX7-NEXT:    v_bfe_i32 v11, v0, 24, 4
+; GFX7-NEXT:    v_bfe_i32 v12, v0, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v14, v0, 12, 4
+; GFX7-NEXT:    v_bfe_i32 v15, v0, 8, 4
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 4
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v11
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v10
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v10
+; GFX7-NEXT:    v_and_b32_e32 v7, s4, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v14
-; GFX7-NEXT:    v_and_b32_e32 v11, v2, v15
-; GFX7-NEXT:    v_and_b32_e32 v14, v2, v17
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v18
-; GFX7-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v6, s5, v6
-; GFX7-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-NEXT:    v_or_b32_e32 v9, v11, v10
-; GFX7-NEXT:    v_or_b32_e32 v10, v14, v13
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v14
+; GFX7-NEXT:    v_and_b32_e32 v12, s4, v15
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v4, s5, v4
-; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
+; GFX7-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX7-NEXT:    v_or_b32_e32 v7, v9, v8
+; GFX7-NEXT:    v_or_b32_e32 v8, v12, v11
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v2, s5, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX7-NEXT:    v_and_b32_e32 v0, v3, v0
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v7
-; GFX7-NEXT:    v_and_b32_e32 v7, v3, v9
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX7-NEXT:    v_or_b32_e32 v3, v7, v6
-; GFX7-NEXT:    v_and_b32_e32 v7, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v13, v2, v0
-; GFX7-NEXT:    v_bfe_u32 v8, v4, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
+; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX7-NEXT:    v_and_b32_e32 v6, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v12, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, s5, v7
+; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v13, v0, 8, 8
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
-; GFX7-NEXT:    v_and_b32_e32 v12, v2, v12
-; GFX7-NEXT:    v_and_b32_e32 v9, v2, v5
-; GFX7-NEXT:    v_and_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_bfe_u32 v10, v5, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v15, v3, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
+; GFX7-NEXT:    v_and_b32_e32 v8, s4, v3
+; GFX7-NEXT:    v_and_b32_e32 v14, s4, v4
+; GFX7-NEXT:    v_bfe_u32 v9, v3, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v15, v4, 8, 8
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v10, s4, v10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v7, v7, v13, v16
-; GFX7-NEXT:    v_mad_u32_u24 v7, v8, v14, v7
-; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v0, v7
-; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v11, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v2, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v15, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v3, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v12, v0
+; GFX7-NEXT:    v_mad_u32_u24 v6, v6, v12, v16
+; GFX7-NEXT:    v_mad_u32_u24 v6, v7, v13, v6
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v6
+; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v11, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v14, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v15, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v4, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v10, v0
 ; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index ed1914a80429..5bd000fa7b43 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -2481,55 +2481,54 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf00
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xf00
 ; GFX7-NEXT:    s_movk_i32 s5, 0xf0f
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 28, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 12, v2
 ; GFX7-NEXT:    v_bfe_u32 v1, v2, 8, 4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 4, v2
-; GFX7-NEXT:    v_and_b32_e32 v5, 15, v2
-; GFX7-NEXT:    v_bfe_u32 v7, v2, 16, 4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 12, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 28, v2
+; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 4, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 4, v0
-; GFX7-NEXT:    v_alignbit_b32 v2, v6, v2, 24
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v0
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 4
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_and_b32_e32 v6, v3, v9
-; GFX7-NEXT:    v_and_b32_e32 v3, v3, v11
-; GFX7-NEXT:    v_and_b32_e32 v12, 15, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 28, v0
+; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 15, v2
+; GFX7-NEXT:    v_bfe_u32 v9, v0, 8, 4
+; GFX7-NEXT:    v_and_b32_e32 v11, 15, v0
+; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 12, v0
+; GFX7-NEXT:    v_alignbit_b32 v2, v5, v2, 24
+; GFX7-NEXT:    v_and_b32_e32 v5, s4, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
+; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT:    v_alignbit_b32 v0, v12, v0, 24
+; GFX7-NEXT:    v_and_b32_e32 v7, s4, v10
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v14
+; GFX7-NEXT:    v_and_b32_e32 v5, s4, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_bfe_u32 v14, v0, 16, 4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_or_b32_e32 v7, v9, v7
 ; GFX7-NEXT:    v_and_b32_e32 v2, s5, v2
-; GFX7-NEXT:    v_alignbit_b32 v0, v13, v0, 24
-; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX7-NEXT:    v_or_b32_e32 v3, v6, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v15
+; GFX7-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX7-NEXT:    v_or_b32_e32 v5, v11, v5
+; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v6, v2
 ; GFX7-NEXT:    v_and_b32_e32 v6, 15, v1
 ; GFX7-NEXT:    v_and_b32_e32 v12, 15, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v14, v4
-; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 4
 ; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v6, v6, v12, v16
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index 98ee7d47f2f8..52baec321bcc 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=PRE-GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck  --check-prefix=CHECK --check-prefix=GFX8 %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck  --check-prefix=CHECK %s
 
 ; CHECK-LABEL: {{^}}inline_asm:
 ; CHECK: s_endpgm
@@ -260,8 +260,7 @@ entry:
 ; CHECK: ; def v0
 ; CHECK: v_mov_b32_e32 v1, v0
 ; CHECK: ; def v0
-; PRE-GFX8: v_lshl_b32_e32 v{{[0-9]+}}, v1, v0
-; GFX8: v_lshlrev_b32_e32 v{{[0-9]+}}, v0, v1
+; CHECK: v_lshlrev_b32_e32 v{{[0-9]+}}, v0, v1
 define amdgpu_kernel void @muliple_def_phys_vgpr() {
 entry:
   %def0 = call i32 asm sideeffect "; def $0 ", "={v0}"()

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 50b75095b056..bb52fc6597fe 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -711,7 +711,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x3e70000
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -769,7 +769,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xfff10000
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -945,7 +945,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x45000000
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -1003,7 +1003,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspac
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x230000
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -1343,7 +1343,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1473,7 +1473,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index 4c478b3c64cc..0c0d244c809a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -161,9 +161,8 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; SI-NEXT:    s_mov_b64 s[2:3], exec
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_and_b32_e32 v1, 1, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 1, v0
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
 ; SI-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
@@ -189,9 +188,8 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
@@ -217,9 +215,8 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX10-32-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX10-32-NEXT:    v_and_b32_e32 v1, 1, v0
 ; GFX10-32-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s0
 ; GFX10-32-NEXT:    s_xor_b32 s0, exec_lo, s2
@@ -245,9 +242,8 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX10-64-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX10-64-NEXT:    v_and_b32_e32 v1, 1, v0
 ; GFX10-64-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX10-64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
 ; GFX10-64-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]

diff  --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 2982b21bf047..53e8a3c48fba 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -123,8 +123,8 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; CI-NEXT:    v_lshr_b32_e32 v2, v2, v3
-; CI-NEXT:    v_lshr_b32_e32 v3, v4, v5
+; CI-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
@@ -512,10 +512,10 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; CI-NEXT:    v_lshr_b32_e32 v3, v3, v5
-; CI-NEXT:    v_lshr_b32_e32 v5, v7, v9
-; CI-NEXT:    v_lshr_b32_e32 v2, v2, v4
-; CI-NEXT:    v_lshr_b32_e32 v4, v6, v8
+; CI-NEXT:    v_lshrrev_b32_e32 v3, v5, v3
+; CI-NEXT:    v_lshrrev_b32_e32 v5, v9, v7
+; CI-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v4, v8, v6
 ; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; CI-NEXT:    v_or_b32_e32 v3, v3, v5

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 53ccafa912d8..dc698ac61b70 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -573,8 +573,7 @@ entry:
 ;
 ; TODO: Why is the constant not peepholed into the v_or_b32_e32?
 ;
-; NOSDWA: s_mov_b32 [[CONST:s[0-9]+]], 0x10000
-; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, s0,
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000,
 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000,
 define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 {
 bb:

diff  --git a/llvm/test/CodeGen/AMDGPU/select-constant-xor.ll b/llvm/test/CodeGen/AMDGPU/select-constant-xor.ll
index ea2c9fda56e2..7dc7afbaa1df 100644
--- a/llvm/test/CodeGen/AMDGPU/select-constant-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-constant-xor.ll
@@ -105,7 +105,7 @@ define i32 @selecti8i32(i8 %a) {
 ; CHECK-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x54
 ; CHECK-NEXT:    v_ashrrev_i16 v0, 7, v0
-; CHECK-NEXT:    v_xor_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; CHECK-NEXT:    v_xor_b32_sdwa v0, sext(v0), v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %c = icmp sgt i8 %a, -1
   %s = select i1 %c, i32 84, i32 -85

diff  --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
index 4a7684b65404..cb77e8268699 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -577,7 +577,7 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addr
 ; GCN: {{buffer|flat|global}}_load_ushort [[VAL0:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_ushort [[VAL1:v[0-9]+]]
 
-; SI: v_lshl_b32_e32 [[REG:v[0-9]+]], [[VAL0]], [[VAL1]]
+; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
 ; GFX89: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
 
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}}

diff  --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index a56bcfb1073f..ae0c88e15bc6 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -397,7 +397,7 @@ define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 a
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
-; SI-NEXT:    v_lshl_b32_e32 v0, v2, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v0, v2
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -558,8 +558,8 @@ define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; SI-NEXT:    v_lshl_b32_e32 v0, v2, v0
-; SI-NEXT:    v_lshl_b32_e32 v1, v1, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v0, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -655,10 +655,10 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; SI-NEXT:    v_lshl_b32_e32 v3, v3, v5
-; SI-NEXT:    v_lshl_b32_e32 v2, v2, v4
-; SI-NEXT:    v_lshl_b32_e32 v4, v7, v9
-; SI-NEXT:    v_lshl_b32_e32 v5, v6, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v4, v9, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v5, v8, v6
 ; SI-NEXT:    v_and_b32_e32 v3, s0, v3
 ; SI-NEXT:    v_and_b32_e32 v2, s0, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 7f8f8c3d5880..aa7763bfece0 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -126,8 +126,8 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
 ; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; CI-NEXT:    v_lshl_b32_e32 v2, v2, v3
-; CI-NEXT:    v_lshl_b32_e32 v3, v4, v5
+; CI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v3, v5, v4
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -516,10 +516,10 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; CI-NEXT:    v_lshl_b32_e32 v3, v3, v5
-; CI-NEXT:    v_lshl_b32_e32 v2, v2, v4
-; CI-NEXT:    v_lshl_b32_e32 v4, v7, v9
-; CI-NEXT:    v_lshl_b32_e32 v5, v6, v8
+; CI-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v4, v9, v7
+; CI-NEXT:    v_lshlrev_b32_e32 v5, v8, v6
 ; CI-NEXT:    v_and_b32_e32 v3, s0, v3
 ; CI-NEXT:    v_and_b32_e32 v2, s0, v2
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 4a7a6b77c042..79f441e44329 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -525,7 +525,7 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
-; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX6-NEXT:    v_xor_b32_e32 v3, s6, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -559,7 +559,7 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
-; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX8-NEXT:    v_xor_b32_e32 v3, s6, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -613,39 +613,38 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v3, v11
-; GFX6-NEXT:    v_bfrev_b32_e32 v16, 1
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v3, v16, v3
+; GFX6-NEXT:    v_xor_b32_e32 v3, s6, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v4, v12
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v4, v16, v4
+; GFX6-NEXT:    v_xor_b32_e32 v4, s6, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v13
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v5, v16, v5
+; GFX6-NEXT:    v_xor_b32_e32 v5, s6, v5
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v6, v14
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v6, v16, v6
+; GFX6-NEXT:    v_xor_b32_e32 v6, s6, v6
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v7, v15
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v7, v16, v7
+; GFX6-NEXT:    v_xor_b32_e32 v7, s6, v7
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -676,39 +675,38 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v3, v11
-; GFX8-NEXT:    v_bfrev_b32_e32 v16, 1
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v3, v16, v3
+; GFX8-NEXT:    v_xor_b32_e32 v3, s6, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v4, v12
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v4, v16, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v5, v13
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v5, v16, v5
+; GFX8-NEXT:    v_xor_b32_e32 v5, s6, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v6, v14
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v6, v16, v6
+; GFX8-NEXT:    v_xor_b32_e32 v6, s6, v6
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v7, v15
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v7, v16, v7
+; GFX8-NEXT:    v_xor_b32_e32 v7, s6, v7
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -772,93 +770,92 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v3, v19
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
-; GFX6-NEXT:    v_bfrev_b32_e32 v17, 1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v3, v17, v3
+; GFX6-NEXT:    v_xor_b32_e32 v3, s6, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v4, v20
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v4, v17, v4
+; GFX6-NEXT:    v_xor_b32_e32 v4, s6, v4
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v5, v21
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v5, v17, v5
+; GFX6-NEXT:    v_xor_b32_e32 v5, s6, v5
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v6, v22
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v6, v17, v6
+; GFX6-NEXT:    v_xor_b32_e32 v6, s6, v6
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v7, v23
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v7
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v7, v17, v7
+; GFX6-NEXT:    v_xor_b32_e32 v7, s6, v7
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v8, v24
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v8
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v8, v17, v8
+; GFX6-NEXT:    v_xor_b32_e32 v8, s6, v8
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v9, v25
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v9
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v9, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v9, v17, v9
+; GFX6-NEXT:    v_xor_b32_e32 v9, s6, v9
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v10, v26
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v10
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v10, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v10, v17, v10
+; GFX6-NEXT:    v_xor_b32_e32 v10, s6, v10
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v11, v27
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v11
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v11, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v11, v17, v11
+; GFX6-NEXT:    v_xor_b32_e32 v11, s6, v11
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v12, v28
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v12
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v12, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v12, v17, v12
+; GFX6-NEXT:    v_xor_b32_e32 v12, s6, v12
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v13, v29
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v13
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v13, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v13, v17, v13
+; GFX6-NEXT:    v_xor_b32_e32 v13, s6, v13
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v14, v30
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v14
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v14, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v14, v17, v14
+; GFX6-NEXT:    v_xor_b32_e32 v14, s6, v14
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v15, v31
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v31
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v15, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v15, v17, v15
+; GFX6-NEXT:    v_xor_b32_e32 v15, s6, v15
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -891,93 +888,92 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v3, v19
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
-; GFX8-NEXT:    v_bfrev_b32_e32 v17, 1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v3, v17, v3
+; GFX8-NEXT:    v_xor_b32_e32 v3, s6, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v4, v20
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v4, v17, v4
+; GFX8-NEXT:    v_xor_b32_e32 v4, s6, v4
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v5, v21
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v5
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v5, v17, v5
+; GFX8-NEXT:    v_xor_b32_e32 v5, s6, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v6, v22
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v6
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v6, v17, v6
+; GFX8-NEXT:    v_xor_b32_e32 v6, s6, v6
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v7, v23
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v7
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v7, v17, v7
+; GFX8-NEXT:    v_xor_b32_e32 v7, s6, v7
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v8, v24
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v8
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v8, v17, v8
+; GFX8-NEXT:    v_xor_b32_e32 v8, s6, v8
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v9, v25
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v9
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v9, v17, v9
+; GFX8-NEXT:    v_xor_b32_e32 v9, s6, v9
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v10, v26
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v10
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v10, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v10, v17, v10
+; GFX8-NEXT:    v_xor_b32_e32 v10, s6, v10
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v11, v27
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v11
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v11, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v11, v17, v11
+; GFX8-NEXT:    v_xor_b32_e32 v11, s6, v11
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v12, v28
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v12
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v12, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v12, v17, v12
+; GFX8-NEXT:    v_xor_b32_e32 v12, s6, v12
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v13, v29
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v13
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v13, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v13, v17, v13
+; GFX8-NEXT:    v_xor_b32_e32 v13, s6, v13
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v14, v30
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v14
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v14, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v14, v17, v14
+; GFX8-NEXT:    v_xor_b32_e32 v14, s6, v14
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v15, v31
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v31
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v15, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v15, v17, v15
+; GFX8-NEXT:    v_xor_b32_e32 v15, s6, v15
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 4f7e04870133..86fd470347cb 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -37,8 +37,8 @@ define void @needs_align16_default_stack_align(i32 %idx) #0 {
 
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: s_addk_i32 s32, 0x2800{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
+; GCN: s_addk_i32 s32, 0x2800{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 
@@ -58,8 +58,8 @@ define void @needs_align16_stack_align4(i32 %idx) #2 {
 
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: s_addk_i32 s32, 0x3000{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
+; GCN: s_addk_i32 s32, 0x3000{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
 

diff  --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index e726853ec106..fa5f37a4e57e 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -49,8 +49,7 @@ define amdgpu_gfx void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) {
 ; GFX9-O0-NEXT:    s_mov_b32 s35, 1
 ; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s35, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s35, 2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s35
-; GFX9-O0-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX9-O0-NEXT:    v_and_b32_e64 v3, v3, s35
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[36:39], s34 offset:4
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
@@ -217,8 +216,7 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 1
 ; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s34, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s34
-; GFX9-O0-NEXT:    v_and_b32_e32 v0, v0, v3
+; GFX9-O0-NEXT:    v_and_b32_e64 v0, v0, s34
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[36:39], s34 offset:4
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1


        


More information about the llvm-commits mailing list