[llvm] [AMDGPU] Disable i16 -> i32 promotion in CGP for DAGISel (PR #112334)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 15 01:42:25 PDT 2024
https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/112334
Only promote i16 ops to i32 in CodeGenPrepare (CGP) if GlobalISel is enabled, or if the command-line option is explicitly set.
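In effect, the check in AMDGPUCodeGenPrepareImpl::needsPromotionToI32 becomes the following (a minimal sketch of the patch below; Widen16BitOps is the existing cl::opt behind -amdgpu-codegenprepare-widen-16-bit-ops):

    // Honor -amdgpu-codegenprepare-widen-16-bit-ops when passed explicitly.
    if (Widen16BitOps.getNumOccurrences()) {
      if (!Widen16BitOps)
        return false;
    } else if (!getCGPassBuilderOption().EnableGlobalISelOption) {
      // Flag left at its default: widen in CGP only for GlobalISel. For
      // DAGISel, uniform i16 ops are now promoted during ISel instead (see
      // SITargetLowering::promoteUniformOpToI32 in the patch).
      return false;
    }

DAGISel users who want the old CGP widening behavior back can still opt in by passing the flag explicitly.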
From d05a3222b5637cb3a8cec49acf882b6b7fdd76a4 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 15 Oct 2024 10:40:13 +0200
Subject: [PATCH] [AMDGPU] Disable i16 -> i32 promotion in CGP for DAGISel
Only promote i16 ops to i32 in CodeGenPrepare (CGP) if GlobalISel is enabled, or if the command-line option is explicitly set.
---
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 7 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +-
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 33 +-
llvm/test/CodeGen/AMDGPU/anyext.ll | 8 +-
.../CodeGen/AMDGPU/calling-conventions.ll | 413 +++++++++---------
.../CodeGen/AMDGPU/cgp-bitfield-extract.ll | 9 +-
llvm/test/CodeGen/AMDGPU/ctlz.ll | 21 +-
llvm/test/CodeGen/AMDGPU/cttz.ll | 12 +-
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 20 +-
llvm/test/CodeGen/AMDGPU/fneg.ll | 13 +-
llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 14 +-
llvm/test/CodeGen/AMDGPU/min.ll | 369 ++++++++--------
llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 19 +-
llvm/test/CodeGen/AMDGPU/sext-in-reg.ll | 15 +-
llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 6 +-
llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 230 +++++-----
llvm/test/CodeGen/AMDGPU/sra.ll | 79 ++--
llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 34 +-
llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 21 +-
llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 2 +-
llvm/test/CodeGen/AMDGPU/zero_extend.ll | 7 +-
22 files changed, 670 insertions(+), 698 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 7d3164c79089e0..4e3fb628e7ea7a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -401,7 +401,12 @@ bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
}
bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
- if (!Widen16BitOps)
+ // If the widening option is explicitly set, honor it. Otherwise, only
+ // widen in CGP when GlobalISel is enabled; DAGISel promotes in ISel.
+ if (Widen16BitOps.getNumOccurrences()) {
+ if (!Widen16BitOps)
+ return false;
+ } else if (!getCGPassBuilderOption().EnableGlobalISelOption)
return false;
const IntegerType *IntTy = dyn_cast<IntegerType>(T);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8c197f23149612..c7bae4dd05fd34 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6822,6 +6822,22 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
}
}
+bool SITargetLowering::shouldPromoteUniformOpToI32(SDValue Op, EVT ExtTy, EVT OpTy) const {
+ switch (Op.getOpcode()) {
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX: {
+ if (!Subtarget->has16BitInsts() || OpTy.isVector())
+ return false;
+ unsigned Size = OpTy.getSizeInBits();
+ return !Op->isDivergent() && Size >= 2 && Size <= 16;
+ }
+ default:
+ return !isNarrowingProfitable(Op.getNode(), ExtTy, OpTy);
+ }
+}
+
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
DAGCombinerInfo &DCI) const {
const unsigned Opc = Op.getOpcode();
@@ -6836,7 +6852,7 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
auto ExtTy = OpTy.changeElementType(MVT::i32);
if (DCI.isBeforeLegalizeOps() ||
- isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
+ !shouldPromoteUniformOpToI32(Op, ExtTy, OpTy))
return SDValue();
auto &DAG = DCI.DAG;
@@ -6852,14 +6868,25 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
RHS = Op->getOperand(1);
}
- const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
- LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
+ // For SIGN_EXTEND, check for an existing SIGN_EXTEND_INREG and reproduce
+ // it at the wider type instead; this leads to better code generation.
+ // TODO: DAGCombiner should take care of this, but the combine doesn't always apply.
+ const auto ExtendOp = [&, ExtOp = getExtOpcodeForPromotedOp(Op)](SDValue Op) {
+ if (ExtOp == ISD::SIGN_EXTEND && Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ SDValue InRegSrc = DAG.getNode(ISD::ANY_EXTEND, DL, ExtTy, Op.getOperand(0));
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtTy, {InRegSrc, Op->getOperand(1)});
+ }
+
+ return DAG.getNode(ExtOp, DL, ExtTy, {Op});
+ };
+
+ LHS = ExtendOp(LHS);
// Special case: for shifts, the RHS always needs a zext.
if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
else
- RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
+ RHS = ExtendOp(RHS);
// setcc always return i1/i1 vec so no need to truncate after.
if (Opc == ISD::SETCC) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 6c3edf37945e24..4fdecd8252125a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -147,6 +147,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
+ bool shouldPromoteUniformOpToI32(SDValue Op, EVT ExtTy, EVT OpTy) const;
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 4cc384e9d27188..bcd75255acef44 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -103,13 +103,13 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: s_lshr_b32 s3, s0, 16
+; VI-NEXT: s_add_i32 s1, s2, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: s_lshr_b32 s2, s2, 16
; VI-NEXT: s_add_i32 s2, s2, s0
-; VI-NEXT: s_add_i32 s1, s1, s3
-; VI-NEXT: s_and_b32 s0, s2, 0xffff
-; VI-NEXT: s_lshl_b32 s1, s1, 16
-; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_lshl_b32 s0, s2, 16
+; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -170,16 +170,15 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: s_and_b32 s1, s2, 0xffff
+; VI-NEXT: s_lshr_b32 s1, s2, 16
+; VI-NEXT: s_add_i32 s0, s2, s2
; VI-NEXT: s_add_i32 s1, s1, s1
-; VI-NEXT: s_add_i32 s0, s0, s0
-; VI-NEXT: s_lshl_b32 s0, s0, 16
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_lshl_b32 s1, s1, 16
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -230,12 +229,12 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s2, 16
-; VI-NEXT: s_lshr_b32 s5, s3, 16
+; VI-NEXT: s_lshr_b32 s4, s3, 16
+; VI-NEXT: s_lshr_b32 s5, s2, 16
; VI-NEXT: s_add_i32 s2, s2, s3
-; VI-NEXT: s_add_i32 s4, s4, s5
+; VI-NEXT: s_add_i32 s5, s5, s4
; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_lshl_b32 s3, s4, 16
+; VI-NEXT: s_lshl_b32 s3, s5, 16
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 8b6c8be9f37882..115cb40676da8c 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -27,11 +27,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_cmp_lg_u32 s4, 0
; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -42,11 +40,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX9-NEXT: v_not_b32_e32 v0, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 29770738f83d57..e9ddc801b050c5 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -980,7 +980,6 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
;
; VI-LABEL: ps_mesa_inreg_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_and_b32 s0, 0xffff, s0
; VI-NEXT: s_add_i32 s0, s0, s0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: flat_store_short v[0:1], v0
@@ -988,9 +987,8 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
;
; GFX11-LABEL: ps_mesa_inreg_i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s0, s0, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b16 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -1140,20 +1138,20 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s2, s0, 16
-; VI-NEXT: s_lshr_b32 s1, s0, 24
-; VI-NEXT: s_add_i32 s2, s2, s2
+; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: s_lshr_b32 s2, s0, 24
; VI-NEXT: s_bfe_u32 s3, s0, 0x80008
+; VI-NEXT: s_add_i32 s2, s2, s2
; VI-NEXT: s_add_i32 s1, s1, s1
-; VI-NEXT: s_and_b32 s2, s2, 0xff
-; VI-NEXT: s_add_i32 s3, s3, s3
+; VI-NEXT: s_lshl_b32 s2, s2, 8
+; VI-NEXT: s_and_b32 s1, s1, 0xff
; VI-NEXT: s_add_i32 s0, s0, s0
-; VI-NEXT: s_lshl_b32 s1, s1, 24
-; VI-NEXT: s_lshl_b32 s2, s2, 16
+; VI-NEXT: s_add_i32 s3, s3, s3
; VI-NEXT: s_or_b32 s1, s1, s2
; VI-NEXT: s_and_b32 s0, s0, 0xff
; VI-NEXT: s_lshl_b32 s2, s3, 8
; VI-NEXT: s_or_b32 s0, s0, s2
+; VI-NEXT: s_lshl_b32 s1, s1, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -1227,8 +1225,8 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: s_bfe_u32 s2, s0, 0x80008
; VI-NEXT: s_add_i32 s0, s0, s0
-; VI-NEXT: s_add_i32 s1, s1, s1
; VI-NEXT: s_add_i32 s2, s2, s2
+; VI-NEXT: s_add_i32 s1, s1, s1
; VI-NEXT: s_and_b32 s0, s0, 0xff
; VI-NEXT: s_lshl_b32 s2, s2, 8
; VI-NEXT: v_mov_b32_e32 v2, s1
@@ -1308,22 +1306,21 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
; VI-NEXT: v_mov_b32_e32 v0, 4
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s0, 16
-; VI-NEXT: s_lshr_b32 s2, s0, 24
-; VI-NEXT: s_add_i32 s3, s3, s3
+; VI-NEXT: s_lshr_b32 s2, s0, 16
+; VI-NEXT: s_lshr_b32 s3, s0, 24
; VI-NEXT: s_bfe_u32 s4, s0, 0x80008
+; VI-NEXT: s_add_i32 s3, s3, s3
; VI-NEXT: s_add_i32 s2, s2, s2
-; VI-NEXT: s_and_b32 s3, s3, 0xff
-; VI-NEXT: s_add_i32 s4, s4, s4
+; VI-NEXT: s_lshl_b32 s3, s3, 8
+; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: s_add_i32 s0, s0, s0
-; VI-NEXT: s_lshl_b32 s2, s2, 24
-; VI-NEXT: s_lshl_b32 s3, s3, 16
-; VI-NEXT: s_and_b32 s1, s1, 0xff
+; VI-NEXT: s_add_i32 s4, s4, s4
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_and_b32 s0, s0, 0xff
; VI-NEXT: s_lshl_b32 s3, s4, 8
-; VI-NEXT: s_add_i32 s1, s1, s1
; VI-NEXT: s_or_b32 s0, s0, s3
+; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_lshl_b32 s2, s2, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: s_or_b32 s0, s0, s2
@@ -1423,37 +1420,37 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s1, 16
-; VI-NEXT: s_lshr_b32 s2, s1, 24
+; VI-NEXT: s_lshr_b32 s2, s0, 16
+; VI-NEXT: s_lshr_b32 s3, s0, 24
+; VI-NEXT: s_lshr_b32 s4, s1, 16
+; VI-NEXT: s_lshr_b32 s5, s1, 24
+; VI-NEXT: s_bfe_u32 s6, s0, 0x80008
+; VI-NEXT: s_bfe_u32 s7, s1, 0x80008
+; VI-NEXT: s_add_i32 s5, s5, s5
+; VI-NEXT: s_add_i32 s4, s4, s4
; VI-NEXT: s_add_i32 s3, s3, s3
-; VI-NEXT: s_bfe_u32 s6, s1, 0x80008
; VI-NEXT: s_add_i32 s2, s2, s2
-; VI-NEXT: s_and_b32 s3, s3, 0xff
-; VI-NEXT: s_add_i32 s6, s6, s6
+; VI-NEXT: s_lshl_b32 s5, s5, 8
+; VI-NEXT: s_and_b32 s4, s4, 0xff
; VI-NEXT: s_add_i32 s1, s1, s1
-; VI-NEXT: s_lshl_b32 s2, s2, 24
-; VI-NEXT: s_lshl_b32 s3, s3, 16
-; VI-NEXT: s_lshr_b32 s5, s0, 16
-; VI-NEXT: s_or_b32 s2, s2, s3
-; VI-NEXT: s_and_b32 s1, s1, 0xff
-; VI-NEXT: s_lshl_b32 s3, s6, 8
-; VI-NEXT: s_lshr_b32 s4, s0, 24
-; VI-NEXT: s_add_i32 s5, s5, s5
-; VI-NEXT: s_or_b32 s1, s1, s3
-; VI-NEXT: s_bfe_u32 s7, s0, 0x80008
-; VI-NEXT: s_add_i32 s4, s4, s4
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_and_b32 s3, s5, 0xff
; VI-NEXT: s_add_i32 s7, s7, s7
+; VI-NEXT: s_lshl_b32 s3, s3, 8
+; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: s_add_i32 s0, s0, s0
-; VI-NEXT: s_or_b32 s1, s1, s2
-; VI-NEXT: s_lshl_b32 s2, s4, 24
-; VI-NEXT: s_lshl_b32 s3, s3, 16
+; VI-NEXT: s_add_i32 s6, s6, s6
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: s_and_b32 s1, s1, 0xff
+; VI-NEXT: s_lshl_b32 s5, s7, 8
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_and_b32 s0, s0, 0xff
-; VI-NEXT: s_lshl_b32 s3, s7, 8
+; VI-NEXT: s_lshl_b32 s3, s6, 8
+; VI-NEXT: s_or_b32 s1, s1, s5
; VI-NEXT: s_or_b32 s0, s0, s3
+; VI-NEXT: s_lshl_b32 s4, s4, 16
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_lshl_b32 s2, s2, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_or_b32 s1, s1, s4
; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1595,69 +1592,69 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s3, 16
-; VI-NEXT: s_lshr_b32 s4, s3, 24
+; VI-NEXT: s_lshr_b32 s4, s0, 16
+; VI-NEXT: s_lshr_b32 s5, s0, 24
+; VI-NEXT: s_lshr_b32 s6, s1, 16
+; VI-NEXT: s_lshr_b32 s7, s1, 24
+; VI-NEXT: s_lshr_b32 s8, s2, 16
+; VI-NEXT: s_lshr_b32 s9, s2, 24
+; VI-NEXT: s_lshr_b32 s10, s3, 16
+; VI-NEXT: s_lshr_b32 s11, s3, 24
+; VI-NEXT: s_bfe_u32 s12, s0, 0x80008
+; VI-NEXT: s_bfe_u32 s13, s1, 0x80008
+; VI-NEXT: s_bfe_u32 s14, s2, 0x80008
+; VI-NEXT: s_bfe_u32 s15, s3, 0x80008
+; VI-NEXT: s_add_i32 s11, s11, s11
+; VI-NEXT: s_add_i32 s10, s10, s10
+; VI-NEXT: s_add_i32 s9, s9, s9
+; VI-NEXT: s_add_i32 s8, s8, s8
+; VI-NEXT: s_add_i32 s7, s7, s7
+; VI-NEXT: s_add_i32 s6, s6, s6
; VI-NEXT: s_add_i32 s5, s5, s5
-; VI-NEXT: s_bfe_u32 s12, s3, 0x80008
; VI-NEXT: s_add_i32 s4, s4, s4
-; VI-NEXT: s_and_b32 s5, s5, 0xff
-; VI-NEXT: s_add_i32 s12, s12, s12
+; VI-NEXT: s_lshl_b32 s11, s11, 8
+; VI-NEXT: s_and_b32 s10, s10, 0xff
; VI-NEXT: s_add_i32 s3, s3, s3
-; VI-NEXT: s_lshl_b32 s4, s4, 24
-; VI-NEXT: s_lshl_b32 s5, s5, 16
-; VI-NEXT: s_lshr_b32 s7, s2, 16
-; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_and_b32 s3, s3, 0xff
-; VI-NEXT: s_lshl_b32 s5, s12, 8
-; VI-NEXT: s_lshr_b32 s6, s2, 24
-; VI-NEXT: s_add_i32 s7, s7, s7
-; VI-NEXT: s_or_b32 s3, s3, s5
-; VI-NEXT: s_bfe_u32 s13, s2, 0x80008
-; VI-NEXT: s_add_i32 s6, s6, s6
-; VI-NEXT: s_and_b32 s3, s3, 0xffff
-; VI-NEXT: s_and_b32 s5, s7, 0xff
-; VI-NEXT: s_add_i32 s13, s13, s13
+; VI-NEXT: s_add_i32 s15, s15, s15
+; VI-NEXT: s_lshl_b32 s9, s9, 8
+; VI-NEXT: s_and_b32 s8, s8, 0xff
; VI-NEXT: s_add_i32 s2, s2, s2
-; VI-NEXT: s_or_b32 s3, s3, s4
-; VI-NEXT: s_lshl_b32 s4, s6, 24
-; VI-NEXT: s_lshl_b32 s5, s5, 16
-; VI-NEXT: s_lshr_b32 s9, s1, 16
-; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_and_b32 s2, s2, 0xff
-; VI-NEXT: s_lshl_b32 s5, s13, 8
-; VI-NEXT: s_lshr_b32 s8, s1, 24
-; VI-NEXT: s_add_i32 s9, s9, s9
-; VI-NEXT: s_or_b32 s2, s2, s5
-; VI-NEXT: s_bfe_u32 s14, s1, 0x80008
-; VI-NEXT: s_add_i32 s8, s8, s8
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_and_b32 s5, s9, 0xff
; VI-NEXT: s_add_i32 s14, s14, s14
+; VI-NEXT: s_lshl_b32 s7, s7, 8
+; VI-NEXT: s_and_b32 s6, s6, 0xff
; VI-NEXT: s_add_i32 s1, s1, s1
-; VI-NEXT: s_or_b32 s2, s2, s4
-; VI-NEXT: s_lshl_b32 s4, s8, 24
-; VI-NEXT: s_lshl_b32 s5, s5, 16
-; VI-NEXT: s_lshr_b32 s11, s0, 16
-; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_and_b32 s1, s1, 0xff
-; VI-NEXT: s_lshl_b32 s5, s14, 8
-; VI-NEXT: s_lshr_b32 s10, s0, 24
-; VI-NEXT: s_add_i32 s11, s11, s11
-; VI-NEXT: s_or_b32 s1, s1, s5
-; VI-NEXT: s_bfe_u32 s15, s0, 0x80008
-; VI-NEXT: s_add_i32 s10, s10, s10
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_and_b32 s5, s11, 0xff
-; VI-NEXT: s_add_i32 s15, s15, s15
+; VI-NEXT: s_add_i32 s13, s13, s13
+; VI-NEXT: s_lshl_b32 s5, s5, 8
+; VI-NEXT: s_and_b32 s4, s4, 0xff
; VI-NEXT: s_add_i32 s0, s0, s0
-; VI-NEXT: s_or_b32 s1, s1, s4
-; VI-NEXT: s_lshl_b32 s4, s10, 24
-; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: s_add_i32 s12, s12, s12
+; VI-NEXT: s_or_b32 s10, s10, s11
+; VI-NEXT: s_and_b32 s3, s3, 0xff
+; VI-NEXT: s_lshl_b32 s11, s15, 8
+; VI-NEXT: s_or_b32 s8, s8, s9
+; VI-NEXT: s_and_b32 s2, s2, 0xff
+; VI-NEXT: s_lshl_b32 s9, s14, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s1, s1, 0xff
+; VI-NEXT: s_lshl_b32 s7, s13, 8
; VI-NEXT: s_or_b32 s4, s4, s5
; VI-NEXT: s_and_b32 s0, s0, 0xff
-; VI-NEXT: s_lshl_b32 s5, s15, 8
+; VI-NEXT: s_lshl_b32 s5, s12, 8
+; VI-NEXT: s_or_b32 s3, s3, s11
+; VI-NEXT: s_or_b32 s2, s2, s9
+; VI-NEXT: s_or_b32 s1, s1, s7
; VI-NEXT: s_or_b32 s0, s0, s5
+; VI-NEXT: s_lshl_b32 s10, s10, 16
+; VI-NEXT: s_and_b32 s3, s3, 0xffff
+; VI-NEXT: s_lshl_b32 s8, s8, 16
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_lshl_b32 s4, s4, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_or_b32 s3, s3, s10
+; VI-NEXT: s_or_b32 s2, s2, s8
+; VI-NEXT: s_or_b32 s1, s1, s6
; VI-NEXT: s_or_b32 s0, s0, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1904,138 +1901,138 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; VI-NEXT: v_mov_b32_e32 v4, 16
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s9, s3, 16
-; VI-NEXT: s_lshr_b32 s8, s3, 24
+; VI-NEXT: s_lshr_b32 s8, s4, 16
+; VI-NEXT: s_lshr_b32 s9, s4, 24
+; VI-NEXT: s_lshr_b32 s10, s5, 16
+; VI-NEXT: s_lshr_b32 s11, s5, 24
+; VI-NEXT: s_lshr_b32 s12, s6, 16
+; VI-NEXT: s_lshr_b32 s13, s6, 24
+; VI-NEXT: s_lshr_b32 s14, s7, 16
+; VI-NEXT: s_lshr_b32 s15, s7, 24
+; VI-NEXT: s_bfe_u32 s24, s4, 0x80008
+; VI-NEXT: s_bfe_u32 s25, s5, 0x80008
+; VI-NEXT: s_bfe_u32 s26, s6, 0x80008
+; VI-NEXT: s_bfe_u32 s27, s7, 0x80008
+; VI-NEXT: s_add_i32 s15, s15, s15
+; VI-NEXT: s_add_i32 s14, s14, s14
+; VI-NEXT: s_add_i32 s13, s13, s13
+; VI-NEXT: s_add_i32 s12, s12, s12
+; VI-NEXT: s_add_i32 s11, s11, s11
+; VI-NEXT: s_add_i32 s10, s10, s10
; VI-NEXT: s_add_i32 s9, s9, s9
-; VI-NEXT: s_bfe_u32 s24, s3, 0x80008
; VI-NEXT: s_add_i32 s8, s8, s8
-; VI-NEXT: s_and_b32 s9, s9, 0xff
+; VI-NEXT: s_lshr_b32 s16, s0, 16
+; VI-NEXT: s_lshr_b32 s17, s0, 24
+; VI-NEXT: s_lshr_b32 s18, s1, 16
+; VI-NEXT: s_lshr_b32 s19, s1, 24
+; VI-NEXT: s_lshr_b32 s20, s2, 16
+; VI-NEXT: s_lshr_b32 s21, s2, 24
+; VI-NEXT: s_lshr_b32 s22, s3, 16
+; VI-NEXT: s_lshr_b32 s23, s3, 24
+; VI-NEXT: s_lshl_b32 s15, s15, 8
+; VI-NEXT: s_and_b32 s14, s14, 0xff
+; VI-NEXT: s_add_i32 s7, s7, s7
+; VI-NEXT: s_add_i32 s27, s27, s27
+; VI-NEXT: s_lshl_b32 s13, s13, 8
+; VI-NEXT: s_and_b32 s12, s12, 0xff
+; VI-NEXT: s_add_i32 s6, s6, s6
+; VI-NEXT: s_add_i32 s26, s26, s26
+; VI-NEXT: s_lshl_b32 s11, s11, 8
+; VI-NEXT: s_and_b32 s10, s10, 0xff
+; VI-NEXT: s_add_i32 s5, s5, s5
+; VI-NEXT: s_add_i32 s25, s25, s25
+; VI-NEXT: s_lshl_b32 s9, s9, 8
+; VI-NEXT: s_and_b32 s8, s8, 0xff
+; VI-NEXT: s_add_i32 s4, s4, s4
; VI-NEXT: s_add_i32 s24, s24, s24
-; VI-NEXT: s_add_i32 s3, s3, s3
-; VI-NEXT: s_lshl_b32 s8, s8, 24
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_lshr_b32 s11, s2, 16
+; VI-NEXT: s_bfe_u32 s28, s0, 0x80008
+; VI-NEXT: s_bfe_u32 s29, s1, 0x80008
+; VI-NEXT: s_bfe_u32 s30, s2, 0x80008
+; VI-NEXT: s_bfe_u32 s31, s3, 0x80008
+; VI-NEXT: s_add_i32 s23, s23, s23
+; VI-NEXT: s_add_i32 s22, s22, s22
+; VI-NEXT: s_add_i32 s21, s21, s21
+; VI-NEXT: s_add_i32 s20, s20, s20
+; VI-NEXT: s_add_i32 s19, s19, s19
+; VI-NEXT: s_add_i32 s18, s18, s18
+; VI-NEXT: s_add_i32 s17, s17, s17
+; VI-NEXT: s_add_i32 s16, s16, s16
+; VI-NEXT: s_or_b32 s14, s14, s15
+; VI-NEXT: s_and_b32 s7, s7, 0xff
+; VI-NEXT: s_lshl_b32 s15, s27, 8
+; VI-NEXT: s_or_b32 s12, s12, s13
+; VI-NEXT: s_and_b32 s6, s6, 0xff
+; VI-NEXT: s_lshl_b32 s13, s26, 8
+; VI-NEXT: s_or_b32 s10, s10, s11
+; VI-NEXT: s_and_b32 s5, s5, 0xff
+; VI-NEXT: s_lshl_b32 s11, s25, 8
; VI-NEXT: s_or_b32 s8, s8, s9
-; VI-NEXT: s_and_b32 s3, s3, 0xff
+; VI-NEXT: s_and_b32 s4, s4, 0xff
; VI-NEXT: s_lshl_b32 s9, s24, 8
-; VI-NEXT: s_lshr_b32 s10, s2, 24
-; VI-NEXT: s_add_i32 s11, s11, s11
-; VI-NEXT: s_or_b32 s3, s3, s9
-; VI-NEXT: s_bfe_u32 s25, s2, 0x80008
-; VI-NEXT: s_add_i32 s10, s10, s10
-; VI-NEXT: s_and_b32 s3, s3, 0xffff
-; VI-NEXT: s_and_b32 s9, s11, 0xff
-; VI-NEXT: s_add_i32 s25, s25, s25
+; VI-NEXT: s_lshl_b32 s23, s23, 8
+; VI-NEXT: s_and_b32 s22, s22, 0xff
+; VI-NEXT: s_add_i32 s3, s3, s3
+; VI-NEXT: s_add_i32 s31, s31, s31
+; VI-NEXT: s_lshl_b32 s21, s21, 8
+; VI-NEXT: s_and_b32 s20, s20, 0xff
; VI-NEXT: s_add_i32 s2, s2, s2
-; VI-NEXT: s_or_b32 s3, s3, s8
-; VI-NEXT: s_lshl_b32 s8, s10, 24
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_lshr_b32 s13, s1, 16
-; VI-NEXT: s_or_b32 s8, s8, s9
-; VI-NEXT: s_and_b32 s2, s2, 0xff
-; VI-NEXT: s_lshl_b32 s9, s25, 8
-; VI-NEXT: s_lshr_b32 s12, s1, 24
-; VI-NEXT: s_add_i32 s13, s13, s13
-; VI-NEXT: s_or_b32 s2, s2, s9
-; VI-NEXT: s_bfe_u32 s26, s1, 0x80008
-; VI-NEXT: s_add_i32 s12, s12, s12
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_and_b32 s9, s13, 0xff
-; VI-NEXT: s_add_i32 s26, s26, s26
+; VI-NEXT: s_add_i32 s30, s30, s30
+; VI-NEXT: s_lshl_b32 s19, s19, 8
+; VI-NEXT: s_and_b32 s18, s18, 0xff
; VI-NEXT: s_add_i32 s1, s1, s1
-; VI-NEXT: s_or_b32 s2, s2, s8
-; VI-NEXT: s_lshl_b32 s8, s12, 24
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_lshr_b32 s15, s0, 16
-; VI-NEXT: s_or_b32 s8, s8, s9
-; VI-NEXT: s_and_b32 s1, s1, 0xff
-; VI-NEXT: s_lshl_b32 s9, s26, 8
-; VI-NEXT: s_lshr_b32 s14, s0, 24
-; VI-NEXT: s_add_i32 s15, s15, s15
-; VI-NEXT: s_or_b32 s1, s1, s9
-; VI-NEXT: s_bfe_u32 s27, s0, 0x80008
-; VI-NEXT: s_add_i32 s14, s14, s14
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_and_b32 s9, s15, 0xff
-; VI-NEXT: s_add_i32 s27, s27, s27
+; VI-NEXT: s_add_i32 s29, s29, s29
+; VI-NEXT: s_lshl_b32 s17, s17, 8
+; VI-NEXT: s_and_b32 s16, s16, 0xff
; VI-NEXT: s_add_i32 s0, s0, s0
-; VI-NEXT: s_or_b32 s1, s1, s8
-; VI-NEXT: s_lshl_b32 s8, s14, 24
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_lshr_b32 s17, s7, 16
-; VI-NEXT: s_or_b32 s8, s8, s9
-; VI-NEXT: s_and_b32 s0, s0, 0xff
-; VI-NEXT: s_lshl_b32 s9, s27, 8
-; VI-NEXT: s_lshr_b32 s16, s7, 24
-; VI-NEXT: s_add_i32 s17, s17, s17
-; VI-NEXT: s_or_b32 s0, s0, s9
-; VI-NEXT: s_bfe_u32 s28, s7, 0x80008
-; VI-NEXT: s_add_i32 s16, s16, s16
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_and_b32 s9, s17, 0xff
; VI-NEXT: s_add_i32 s28, s28, s28
-; VI-NEXT: s_add_i32 s7, s7, s7
-; VI-NEXT: s_or_b32 s0, s0, s8
-; VI-NEXT: s_lshl_b32 s8, s16, 24
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_lshr_b32 s19, s6, 16
-; VI-NEXT: s_or_b32 s8, s8, s9
-; VI-NEXT: s_and_b32 s7, s7, 0xff
-; VI-NEXT: s_lshl_b32 s9, s28, 8
-; VI-NEXT: s_lshr_b32 s18, s6, 24
-; VI-NEXT: s_add_i32 s19, s19, s19
-; VI-NEXT: s_or_b32 s7, s7, s9
-; VI-NEXT: s_bfe_u32 s29, s6, 0x80008
-; VI-NEXT: s_add_i32 s18, s18, s18
+; VI-NEXT: s_or_b32 s7, s7, s15
+; VI-NEXT: s_or_b32 s6, s6, s13
+; VI-NEXT: s_or_b32 s5, s5, s11
+; VI-NEXT: s_or_b32 s4, s4, s9
+; VI-NEXT: s_or_b32 s22, s22, s23
+; VI-NEXT: s_and_b32 s3, s3, 0xff
+; VI-NEXT: s_lshl_b32 s23, s31, 8
+; VI-NEXT: s_or_b32 s20, s20, s21
+; VI-NEXT: s_and_b32 s2, s2, 0xff
+; VI-NEXT: s_lshl_b32 s21, s30, 8
+; VI-NEXT: s_or_b32 s18, s18, s19
+; VI-NEXT: s_and_b32 s1, s1, 0xff
+; VI-NEXT: s_lshl_b32 s19, s29, 8
+; VI-NEXT: s_or_b32 s16, s16, s17
+; VI-NEXT: s_and_b32 s0, s0, 0xff
+; VI-NEXT: s_lshl_b32 s17, s28, 8
+; VI-NEXT: s_lshl_b32 s14, s14, 16
; VI-NEXT: s_and_b32 s7, s7, 0xffff
-; VI-NEXT: s_and_b32 s9, s19, 0xff
-; VI-NEXT: s_add_i32 s29, s29, s29
-; VI-NEXT: s_add_i32 s6, s6, s6
-; VI-NEXT: s_or_b32 s7, s7, s8
-; VI-NEXT: s_lshl_b32 s8, s18, 24
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_lshr_b32 s21, s5, 16
-; VI-NEXT: s_or_b32 s8, s8, s9
-; VI-NEXT: s_and_b32 s6, s6, 0xff
-; VI-NEXT: s_lshl_b32 s9, s29, 8
-; VI-NEXT: s_lshr_b32 s20, s5, 24
-; VI-NEXT: s_add_i32 s21, s21, s21
-; VI-NEXT: s_or_b32 s6, s6, s9
-; VI-NEXT: s_bfe_u32 s30, s5, 0x80008
-; VI-NEXT: s_add_i32 s20, s20, s20
+; VI-NEXT: s_lshl_b32 s12, s12, 16
; VI-NEXT: s_and_b32 s6, s6, 0xffff
-; VI-NEXT: s_and_b32 s9, s21, 0xff
-; VI-NEXT: s_add_i32 s30, s30, s30
-; VI-NEXT: s_add_i32 s5, s5, s5
-; VI-NEXT: s_or_b32 s6, s6, s8
-; VI-NEXT: s_lshl_b32 s8, s20, 24
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_lshr_b32 s23, s4, 16
-; VI-NEXT: s_or_b32 s8, s8, s9
-; VI-NEXT: s_and_b32 s5, s5, 0xff
-; VI-NEXT: s_lshl_b32 s9, s30, 8
-; VI-NEXT: s_lshr_b32 s22, s4, 24
-; VI-NEXT: s_add_i32 s23, s23, s23
-; VI-NEXT: s_or_b32 s5, s5, s9
-; VI-NEXT: s_bfe_u32 s31, s4, 0x80008
-; VI-NEXT: s_add_i32 s22, s22, s22
+; VI-NEXT: s_lshl_b32 s10, s10, 16
; VI-NEXT: s_and_b32 s5, s5, 0xffff
-; VI-NEXT: s_and_b32 s9, s23, 0xff
-; VI-NEXT: s_add_i32 s31, s31, s31
-; VI-NEXT: s_add_i32 s4, s4, s4
-; VI-NEXT: s_or_b32 s5, s5, s8
-; VI-NEXT: s_lshl_b32 s8, s22, 24
-; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_or_b32 s8, s8, s9
-; VI-NEXT: s_and_b32 s4, s4, 0xff
-; VI-NEXT: s_lshl_b32 s9, s31, 8
-; VI-NEXT: s_or_b32 s4, s4, s9
+; VI-NEXT: s_lshl_b32 s8, s8, 16
; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_or_b32 s3, s3, s23
+; VI-NEXT: s_or_b32 s2, s2, s21
+; VI-NEXT: s_or_b32 s1, s1, s19
+; VI-NEXT: s_or_b32 s0, s0, s17
+; VI-NEXT: s_or_b32 s7, s7, s14
+; VI-NEXT: s_or_b32 s6, s6, s12
+; VI-NEXT: s_or_b32 s5, s5, s10
; VI-NEXT: s_or_b32 s4, s4, s8
+; VI-NEXT: s_lshl_b32 s22, s22, 16
+; VI-NEXT: s_and_b32 s3, s3, 0xffff
+; VI-NEXT: s_lshl_b32 s20, s20, 16
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_lshl_b32 s18, s18, 16
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_lshl_b32 s16, s16, 16
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: s_or_b32 s3, s3, s22
+; VI-NEXT: s_or_b32 s2, s2, s20
+; VI-NEXT: s_or_b32 s1, s1, s18
+; VI-NEXT: s_or_b32 s0, s0, s16
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index 6505e390355a8c..e492a32cf009c9 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -1,6 +1,6 @@
; RUN: opt -S -mtriple=amdgcn-- -codegenprepare < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -codegenprepare < %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; This particular case will actually be worse in terms of code size
@@ -120,15 +120,12 @@ ret:
; GCN-LABEL: {{^}}sink_ubfe_i16:
; GCN-NOT: lshr
; VI: s_load_dword [[ARG:s[0-9]+]], s[2:3], 0x2c
-; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004
; GCN: s_cbranch_scc{{[0-1]}}
-; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
-; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7f
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
; GCN: .LBB2_3:
-; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
-; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
; GCN: buffer_store_short
; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 93e14a205f05d4..8ef74c70df7a1f 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1650,15 +1650,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: v_mov_b32_e32 v1, 0xffff
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v2, v0
-; VI-NEXT: v_min_u32_e32 v2, 32, v2
-; VI-NEXT: v_add_u32_e32 v2, vcc, -16, v2
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -1696,11 +1691,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
-; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
@@ -1727,13 +1718,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, -16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index ccd23a91c35733..d7e3296c914a11 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -1402,15 +1402,10 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: v_mov_b32_e32 v1, 0xffff
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
-; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT: v_ffbl_b32_e32 v2, v2
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -1448,10 +1443,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1
-; GFX10-NEXT: v_cmp_ne_u32_sdwa vcc_lo, v1, v0 src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 4c7c8bc1c027d7..dc6ee7c562f48e 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1460,13 +1460,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, 0xff
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v2, 0x100, v0
-; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT: v_ffbl_b32_e32 v2, v2
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
+; VI-NEXT: v_ffbl_b32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -1558,19 +1553,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_readfirstlane_b32 s2, v2
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s3, v0
-; VI-NEXT: s_lshl_b32 s2, s2, 8
-; VI-NEXT: s_or_b32 s2, s2, s3
-; VI-NEXT: s_or_b32 s3, s2, 0x10000
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_ff1_i32_b32 s3, s3
-; VI-NEXT: s_cmp_lg_u32 s2, 0
-; VI-NEXT: s_cselect_b32 s2, s3, 0xffff
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_ffbl_b32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index e447429539e6ff..9c3f5f1cd672d8 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -635,12 +635,7 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s2, s4, 16
-; VI-NEXT: s_xor_b32 s3, s4, 0x8000
-; VI-NEXT: s_xor_b32 s2, s2, 0x8000
-; VI-NEXT: s_and_b32 s3, s3, 0xffff
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_or_b32 s2, s3, s2
+; VI-NEXT: s_xor_b32 s2, s4, 0x80008000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -721,11 +716,9 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg)
; VI-NEXT: v_mov_b32_e32 v0, 0x4000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s4, 16
-; VI-NEXT: s_xor_b32 s2, s2, 0x8000
-; VI-NEXT: s_xor_b32 s3, s4, 0x8000
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_add_f16_e64 v1, s3, 2.0
-; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_sub_f16_e64 v1, 2.0, s4
+; VI-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index a4bde5c9d82153..c06a3dab329822 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -20,13 +20,13 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s4, s2, 0xffff
-; VI-NEXT: s_lshr_b32 s2, s2, 16
-; VI-NEXT: s_lshr_b32 s5, s3, 16
-; VI-NEXT: s_lshr_b32 s2, s2, s5
-; VI-NEXT: s_lshr_b32 s3, s4, s3
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_or_b32 s2, s3, s2
+; VI-NEXT: s_lshr_b32 s4, s3, 16
+; VI-NEXT: s_lshr_b32 s5, s2, 16
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_lshr_b32 s4, s5, s4
+; VI-NEXT: s_lshr_b32 s2, s2, s3
+; VI-NEXT: s_lshl_b32 s3, s4, 16
+; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 05ef2698c1f774..1371e82b18b03e 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -434,13 +434,13 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
;
; VI-LABEL: s_test_imin_sle_i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[6:7], 0x28
-; VI-NEXT: s_load_dword s3, s[6:7], 0x4c
+; VI-NEXT: s_load_dword s2, s[6:7], 0x4c
+; VI-NEXT: s_load_dword s3, s[6:7], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: s_sext_i32_i8 s3, s3
-; VI-NEXT: s_min_i32 s2, s2, s3
+; VI-NEXT: s_min_i32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -449,14 +449,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
;
; GFX9-LABEL: s_test_imin_sle_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28
-; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c
+; GFX9-NEXT: s_load_dword s2, s[6:7], 0x4c
+; GFX9-NEXT: s_load_dword s3, s[6:7], 0x28
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i8 s2, s2
; GFX9-NEXT: s_sext_i32_i8 s3, s3
-; GFX9-NEXT: s_min_i32 s2, s2, s3
+; GFX9-NEXT: s_min_i32 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -464,14 +464,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
; GFX10-LABEL: s_test_imin_sle_i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28
-; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c
+; GFX10-NEXT: s_load_dword s2, s[6:7], 0x4c
+; GFX10-NEXT: s_load_dword s3, s[6:7], 0x28
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i8 s2, s2
; GFX10-NEXT: s_sext_i32_i8 s3, s3
-; GFX10-NEXT: s_min_i32 s2, s2, s3
+; GFX10-NEXT: s_min_i32 s2, s3, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@@ -479,15 +479,15 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
; GFX11-LABEL: s_test_imin_sle_i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28
-; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c
+; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c
+; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x28
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sext_i32_i8 s2, s4
; GFX11-NEXT: s_sext_i32_i8 s3, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_i32 s2, s2, s3
+; GFX11-NEXT: s_min_i32 s2, s3, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
@@ -590,28 +590,30 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; VI-LABEL: s_test_imin_sle_v4i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[6:7], 0x28
-; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT: s_load_dword s3, s[6:7], 0x4c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s2, 24
-; VI-NEXT: s_bfe_i32 s5, s2, 0x80010
-; VI-NEXT: s_bfe_i32 s6, s2, 0x80008
-; VI-NEXT: s_sext_i32_i8 s2, s2
-; VI-NEXT: s_ashr_i32 s7, s3, 24
-; VI-NEXT: s_bfe_i32 s8, s3, 0x80010
-; VI-NEXT: s_bfe_i32 s9, s3, 0x80008
+; VI-NEXT: s_ashr_i32 s6, s3, 24
+; VI-NEXT: s_min_i32 s4, s4, s6
+; VI-NEXT: s_bfe_i32 s6, s3, 0x80010
+; VI-NEXT: s_bfe_i32 s8, s2, 0x80010
+; VI-NEXT: s_min_i32 s6, s8, s6
+; VI-NEXT: s_sext_i32_i16 s5, s2
+; VI-NEXT: s_sext_i32_i16 s7, s3
+; VI-NEXT: s_lshl_b32 s4, s4, 8
+; VI-NEXT: s_and_b32 s6, s6, 0xff
+; VI-NEXT: s_or_b32 s4, s6, s4
+; VI-NEXT: s_ashr_i32 s6, s7, 8
+; VI-NEXT: s_ashr_i32 s5, s5, 8
; VI-NEXT: s_sext_i32_i8 s3, s3
+; VI-NEXT: s_sext_i32_i8 s2, s2
+; VI-NEXT: s_min_i32 s5, s5, s6
; VI-NEXT: s_min_i32 s2, s2, s3
-; VI-NEXT: s_min_i32 s3, s6, s9
-; VI-NEXT: s_min_i32 s5, s5, s8
-; VI-NEXT: s_min_i32 s4, s4, s7
-; VI-NEXT: s_and_b32 s5, s5, 0xff
-; VI-NEXT: s_lshl_b32 s3, s3, 8
+; VI-NEXT: s_lshl_b32 s5, s5, 8
; VI-NEXT: s_and_b32 s2, s2, 0xff
-; VI-NEXT: s_lshl_b32 s4, s4, 24
-; VI-NEXT: s_lshl_b32 s5, s5, 16
-; VI-NEXT: s_or_b32 s2, s2, s3
-; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: s_or_b32 s2, s2, s5
+; VI-NEXT: s_lshl_b32 s4, s4, 16
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_or_b32 s2, s2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -622,36 +624,35 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
;
; GFX9-LABEL: s_test_imin_sle_v4i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c
; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28
+; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s5, s2, 16
-; GFX9-NEXT: s_lshr_b32 s8, s3, 16
-; GFX9-NEXT: s_ashr_i32 s9, s3, 24
-; GFX9-NEXT: s_ashr_i32 s6, s2, 24
-; GFX9-NEXT: s_bfe_i32 s8, s8, 0x80000
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_bfe_i32 s5, s5, 0x80000
+; GFX9-NEXT: s_sext_i32_i16 s5, s2
; GFX9-NEXT: s_sext_i32_i16 s7, s3
-; GFX9-NEXT: v_min_i16_e32 v1, s6, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: s_sext_i32_i16 s4, s2
-; GFX9-NEXT: s_lshr_b32 s7, s7, 8
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT: v_min_i16_e32 v2, s5, v2
-; GFX9-NEXT: s_lshr_b32 s4, s4, 8
-; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80000
-; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80000
-; GFX9-NEXT: v_min_i16_e32 v2, s4, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX9-NEXT: v_min_i16_e32 v3, s2, v3
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: s_ashr_i32 s7, s7, 8
+; GFX9-NEXT: s_ashr_i32 s5, s5, 8
+; GFX9-NEXT: s_ashr_i32 s4, s2, 24
+; GFX9-NEXT: s_ashr_i32 s6, s3, 24
+; GFX9-NEXT: s_min_i32 s5, s5, s7
+; GFX9-NEXT: s_sext_i32_i8 s7, s3
+; GFX9-NEXT: s_sext_i32_i8 s8, s2
+; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80010
+; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80010
+; GFX9-NEXT: s_min_i32 s7, s8, s7
+; GFX9-NEXT: s_min_i32 s4, s4, s6
+; GFX9-NEXT: s_min_i32 s2, s2, s3
+; GFX9-NEXT: s_lshl_b32 s5, s5, 8
+; GFX9-NEXT: s_and_b32 s7, s7, 0xff
+; GFX9-NEXT: s_lshl_b32 s4, s4, 8
+; GFX9-NEXT: s_and_b32 s2, s2, 0xff
+; GFX9-NEXT: s_or_b32 s5, s7, s5
+; GFX9-NEXT: s_or_b32 s2, s2, s4
+; GFX9-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_or_b32 s2, s5, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -661,70 +662,69 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28
; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sext_i32_i16 s4, s2
+; GFX10-NEXT: s_sext_i32_i16 s5, s2
; GFX10-NEXT: s_sext_i32_i16 s7, s3
-; GFX10-NEXT: s_ashr_i32 s6, s2, 24
-; GFX10-NEXT: s_ashr_i32 s9, s3, 24
-; GFX10-NEXT: s_lshr_b32 s4, s4, 8
-; GFX10-NEXT: s_lshr_b32 s7, s7, 8
-; GFX10-NEXT: v_min_i16 v0, s6, s9
-; GFX10-NEXT: v_min_i16 v1, s4, s7
-; GFX10-NEXT: s_lshr_b32 s5, s2, 16
-; GFX10-NEXT: s_lshr_b32 s8, s3, 16
-; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80000
-; GFX10-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX10-NEXT: s_bfe_i32 s4, s8, 0x80000
-; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80000
-; GFX10-NEXT: v_min_i16 v2, s5, s4
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX10-NEXT: v_min_i16 v3, s2, s3
-; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT: s_ashr_i32 s4, s2, 24
+; GFX10-NEXT: s_ashr_i32 s6, s3, 24
+; GFX10-NEXT: s_sext_i32_i8 s8, s3
+; GFX10-NEXT: s_sext_i32_i8 s9, s2
+; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80010
+; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80010
+; GFX10-NEXT: s_ashr_i32 s7, s7, 8
+; GFX10-NEXT: s_ashr_i32 s5, s5, 8
+; GFX10-NEXT: s_min_i32 s8, s9, s8
+; GFX10-NEXT: s_min_i32 s4, s4, s6
+; GFX10-NEXT: s_min_i32 s2, s2, s3
+; GFX10-NEXT: s_min_i32 s3, s5, s7
+; GFX10-NEXT: s_and_b32 s5, s8, 0xff
+; GFX10-NEXT: s_lshl_b32 s4, s4, 8
+; GFX10-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-NEXT: s_and_b32 s2, s2, 0xff
+; GFX10-NEXT: s_or_b32 s3, s5, s3
+; GFX10-NEXT: s_or_b32 s2, s2, s4
+; GFX10-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10-NEXT: s_or_b32 s2, s3, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_v4i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x28
-; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x4c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sext_i32_i16 s4, s0
-; GFX11-NEXT: s_lshr_b32 s5, s0, 16
-; GFX11-NEXT: s_sext_i32_i16 s7, s1
-; GFX11-NEXT: s_lshr_b32 s8, s1, 16
-; GFX11-NEXT: s_ashr_i32 s6, s0, 24
-; GFX11-NEXT: s_bfe_i32 s0, s0, 0x80000
-; GFX11-NEXT: s_ashr_i32 s9, s1, 24
-; GFX11-NEXT: s_bfe_i32 s1, s1, 0x80000
-; GFX11-NEXT: s_lshr_b32 s4, s4, 8
-; GFX11-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX11-NEXT: s_lshr_b32 s7, s7, 8
-; GFX11-NEXT: s_bfe_i32 s8, s8, 0x80000
-; GFX11-NEXT: v_min_i16 v0, s6, s9
-; GFX11-NEXT: v_min_i16 v1, s0, s1
-; GFX11-NEXT: v_min_i16 v2, s5, s8
-; GFX11-NEXT: v_min_i16 v3, s4, s7
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28
+; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: s_sext_i32_i16 s3, s4
+; GFX11-NEXT: s_sext_i32_i16 s7, s5
+; GFX11-NEXT: s_ashr_i32 s2, s4, 24
+; GFX11-NEXT: s_ashr_i32 s6, s5, 24
+; GFX11-NEXT: s_sext_i32_i8 s8, s5
+; GFX11-NEXT: s_sext_i32_i8 s9, s4
+; GFX11-NEXT: s_bfe_i32 s5, s5, 0x80010
+; GFX11-NEXT: s_bfe_i32 s4, s4, 0x80010
+; GFX11-NEXT: s_ashr_i32 s7, s7, 8
+; GFX11-NEXT: s_ashr_i32 s3, s3, 8
+; GFX11-NEXT: s_min_i32 s8, s9, s8
+; GFX11-NEXT: s_min_i32 s2, s2, s6
+; GFX11-NEXT: s_min_i32 s4, s4, s5
+; GFX11-NEXT: s_min_i32 s3, s3, s7
+; GFX11-NEXT: s_and_b32 s5, s8, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: s_and_b32 s4, s4, 0xff
+; GFX11-NEXT: s_or_b32 s3, s5, s3
+; GFX11-NEXT: s_or_b32 s2, s4, s2
+; GFX11-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -789,11 +789,11 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s4, s2, 16
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_ashr_i32 s5, s3, 16
+; VI-NEXT: s_ashr_i32 s4, s3, 16
+; VI-NEXT: s_ashr_i32 s5, s2, 16
; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: s_min_i32 s4, s4, s5
+; VI-NEXT: s_sext_i32_i16 s2, s2
+; VI-NEXT: s_min_i32 s4, s5, s4
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: s_lshl_b32 s3, s4, 16
; VI-NEXT: s_and_b32 s2, s2, 0xffff
@@ -953,24 +953,24 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s6, s1, 16
-; VI-NEXT: s_sext_i32_i16 s1, s1
-; VI-NEXT: s_ashr_i32 s8, s3, 16
+; VI-NEXT: s_ashr_i32 s6, s3, 16
+; VI-NEXT: s_ashr_i32 s7, s1, 16
; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: s_ashr_i32 s7, s0, 16
-; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_ashr_i32 s9, s2, 16
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_min_i32 s6, s6, s8
+; VI-NEXT: s_sext_i32_i16 s1, s1
+; VI-NEXT: s_min_i32 s6, s7, s6
; VI-NEXT: s_min_i32 s1, s1, s3
-; VI-NEXT: s_min_i32 s7, s7, s9
-; VI-NEXT: s_min_i32 s0, s0, s2
-; VI-NEXT: s_lshl_b32 s2, s6, 16
+; VI-NEXT: s_lshl_b32 s6, s6, 16
; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_or_b32 s1, s1, s2
-; VI-NEXT: s_lshl_b32 s2, s7, 16
+; VI-NEXT: s_or_b32 s1, s1, s6
+; VI-NEXT: s_ashr_i32 s3, s2, 16
+; VI-NEXT: s_ashr_i32 s6, s0, 16
+; VI-NEXT: s_sext_i32_i16 s2, s2
+; VI-NEXT: s_sext_i32_i16 s0, s0
+; VI-NEXT: s_min_i32 s3, s6, s3
+; VI-NEXT: s_min_i32 s0, s0, s2
+; VI-NEXT: s_lshl_b32 s3, s3, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_or_b32 s0, s0, s2
+; VI-NEXT: s_or_b32 s0, s0, s3
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2661,19 +2661,22 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_ushort v4, v[0:1]
-; VI-NEXT: flat_load_ushort v5, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_ushort v4, v[2:3]
+; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_and_b32_e32 v6, 0xffff, v4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; VI-NEXT: v_and_b32_e32 v7, 0xffff, v5
+; VI-NEXT: v_cmp_lt_u32_e32 vcc, v7, v6
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; VI-NEXT: flat_store_short v[0:1], v4
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -2687,7 +2690,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; GFX9-NEXT: global_load_ushort v1, v0, s[12:13]
; GFX9-NEXT: global_load_ushort v2, v0, s[14:15]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
+; GFX9-NEXT: v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: global_store_short v0, v1, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -2703,7 +2706,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_lt_u32_sdwa vcc_lo, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: global_store_short v0, v1, s[8:9]
@@ -2716,11 +2719,15 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_u16 v1, v0, s[4:5]
-; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-NEXT: global_load_u16 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
@@ -3174,38 +3181,38 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10
; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s2, s11, 16
-; VI-NEXT: s_lshr_b32 s4, s10, 16
-; VI-NEXT: s_and_b32 s5, s10, 0xffff
-; VI-NEXT: s_lshr_b32 s10, s15, 16
+; VI-NEXT: s_and_b32 s2, s15, 0xffff
; VI-NEXT: s_and_b32 s3, s11, 0xffff
-; VI-NEXT: s_and_b32 s11, s15, 0xffff
-; VI-NEXT: s_lshr_b32 s15, s14, 16
-; VI-NEXT: s_min_u32 s2, s2, s10
+; VI-NEXT: s_lshr_b32 s4, s15, 16
+; VI-NEXT: s_lshr_b32 s5, s11, 16
+; VI-NEXT: s_min_u32 s2, s3, s2
+; VI-NEXT: s_min_u32 s3, s5, s4
+; VI-NEXT: s_lshl_b32 s3, s3, 16
+; VI-NEXT: s_or_b32 s2, s2, s3
+; VI-NEXT: s_and_b32 s3, s14, 0xffff
+; VI-NEXT: s_and_b32 s4, s10, 0xffff
+; VI-NEXT: s_min_u32 s3, s4, s3
+; VI-NEXT: s_lshr_b32 s4, s14, 16
+; VI-NEXT: s_lshr_b32 s5, s10, 16
+; VI-NEXT: s_min_u32 s4, s5, s4
+; VI-NEXT: s_lshl_b32 s4, s4, 16
+; VI-NEXT: s_or_b32 s3, s3, s4
+; VI-NEXT: s_and_b32 s4, s13, 0xffff
+; VI-NEXT: s_and_b32 s5, s9, 0xffff
+; VI-NEXT: s_min_u32 s4, s5, s4
+; VI-NEXT: s_lshr_b32 s5, s13, 16
; VI-NEXT: s_lshr_b32 s6, s9, 16
-; VI-NEXT: s_and_b32 s7, s9, 0xffff
-; VI-NEXT: s_lshr_b32 s9, s8, 16
-; VI-NEXT: s_and_b32 s14, s14, 0xffff
-; VI-NEXT: s_lshr_b32 s16, s13, 16
-; VI-NEXT: s_lshr_b32 s17, s12, 16
-; VI-NEXT: s_min_u32 s4, s4, s15
-; VI-NEXT: s_min_u32 s3, s3, s11
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_and_b32 s8, s8, 0xffff
-; VI-NEXT: s_and_b32 s13, s13, 0xffff
-; VI-NEXT: s_and_b32 s12, s12, 0xffff
-; VI-NEXT: s_min_u32 s9, s9, s17
-; VI-NEXT: s_min_u32 s6, s6, s16
-; VI-NEXT: s_min_u32 s5, s5, s14
-; VI-NEXT: s_or_b32 s2, s3, s2
-; VI-NEXT: s_lshl_b32 s3, s4, 16
-; VI-NEXT: s_min_u32 s8, s8, s12
-; VI-NEXT: s_min_u32 s7, s7, s13
-; VI-NEXT: s_or_b32 s3, s5, s3
-; VI-NEXT: s_lshl_b32 s4, s6, 16
-; VI-NEXT: s_lshl_b32 s5, s9, 16
-; VI-NEXT: s_or_b32 s4, s7, s4
-; VI-NEXT: s_or_b32 s5, s8, s5
+; VI-NEXT: s_min_u32 s5, s6, s5
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: s_and_b32 s5, s12, 0xffff
+; VI-NEXT: s_and_b32 s6, s8, 0xffff
+; VI-NEXT: s_min_u32 s5, s6, s5
+; VI-NEXT: s_lshr_b32 s6, s12, 16
+; VI-NEXT: s_lshr_b32 s7, s8, 16
+; VI-NEXT: s_min_u32 s6, s7, s6
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: s_or_b32 s5, s5, s6
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3536,9 +3543,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
; VI-NEXT: s_load_dword s2, s[6:7], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sext_i32_i16 s3, s2
-; VI-NEXT: s_ashr_i32 s2, s2, 16
-; VI-NEXT: s_min_i32 s2, s3, s2
+; VI-NEXT: s_ashr_i32 s3, s2, 16
+; VI-NEXT: s_sext_i32_i16 s2, s2
+; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -3551,9 +3558,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s3, s2
-; GFX9-NEXT: s_ashr_i32 s2, s2, 16
-; GFX9-NEXT: s_min_i32 s2, s3, s2
+; GFX9-NEXT: s_ashr_i32 s3, s2, 16
+; GFX9-NEXT: s_sext_i32_i16 s2, s2
+; GFX9-NEXT: s_min_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -3565,9 +3572,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sext_i32_i16 s3, s2
-; GFX10-NEXT: s_ashr_i32 s2, s2, 16
-; GFX10-NEXT: s_min_i32 s2, s3, s2
+; GFX10-NEXT: s_ashr_i32 s3, s2, 16
+; GFX10-NEXT: s_sext_i32_i16 s2, s2
+; GFX10-NEXT: s_min_i32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@@ -3579,10 +3586,10 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sext_i32_i16 s2, s4
-; GFX11-NEXT: s_ashr_i32 s3, s4, 16
+; GFX11-NEXT: s_ashr_i32 s2, s4, 16
+; GFX11-NEXT: s_sext_i32_i16 s3, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_i32 s2, s2, s3
+; GFX11-NEXT: s_min_i32 s2, s3, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 5a1cc72644d47d..b1066e0f8f26ad 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1787,15 +1787,14 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; NOSDWA-NEXT: flat_load_dword v1, v[0:1]
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
-; NOSDWA-NEXT: s_waitcnt vmcnt(1)
-; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
-; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, v1, v2
+; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v3, v4
-; NOSDWA-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; NOSDWA-NEXT: v_or_b32_e32 v2, v1, v2
+; NOSDWA-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v1
; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: flat_store_dword v[0:1], v2
; NOSDWA-NEXT: s_endpgm
@@ -1813,9 +1812,9 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: v_add_u32_sdwa v3, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX89-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX89-NEXT: v_or_b32_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT: v_add_u32_e32 v3, vcc, v1, v2
+; GFX89-NEXT: v_add_u32_sdwa v1, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX89-NEXT: v_or_b32_sdwa v2, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: flat_store_dword v[0:1], v2
; GFX89-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
index 4e3dccb975fe8e..fdae5d411d3621 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -521,13 +521,10 @@ define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(ptr addrspace(1) %o
; FUNC-LABEL: {{^}}s_sext_in_reg_i1_i16:
; GCN: s_load_dword [[VAL:s[0-9]+]]
-; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000
-; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
-; SI: buffer_store_short [[VBFE]]
+; GCN: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000
+; GCN: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; GCN: buffer_store_short [[VBFE]]
-; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
-; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
define amdgpu_kernel void @s_sext_in_reg_i1_i16(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
%ld = load i32, ptr addrspace(4) %ptr
%in = trunc i32 %ld to i16
@@ -622,9 +619,9 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(ptr addrspace(1) %out, i16 %
; SI: v_mov_b32_e32 [[VSEXT:v[0-9]+]], [[SSEXT]]
; SI: buffer_store_short [[VSEXT]]
-; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
-; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+; GFX89: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x80000
+; GFX89: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; GFX89: buffer_store_short [[VBFE]]
define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(ptr addrspace(1) %out, i16 %in) #0 {
%shl = shl i16 %in, 8
%sext = ashr i16 %shl, 8
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 9b9f03ff74aa3f..44dd0b6e27e740 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -27,9 +27,9 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: s_lshr_b32 s1, s3, 16
-; VI-NEXT: s_lshl_b32 s0, s0, s1
+; VI-NEXT: s_lshr_b32 s0, s3, 16
+; VI-NEXT: s_lshr_b32 s1, s2, 16
+; VI-NEXT: s_lshl_b32 s0, s1, s0
; VI-NEXT: s_lshl_b32 s1, s2, s3
; VI-NEXT: s_lshl_b32 s0, s0, 16
; VI-NEXT: s_and_b32 s1, s1, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 3446e0384cc545..24efb1252dcffe 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -22,19 +22,19 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s2, s4, 16
-; VI-NEXT: s_sub_i32 s3, 0, s4
+; VI-NEXT: s_sub_i32 s2, 0, s4
+; VI-NEXT: s_lshr_b32 s3, s4, 16
; VI-NEXT: s_ashr_i32 s5, s4, 16
+; VI-NEXT: s_sub_i32 s3, 0, s3
+; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_sext_i32_i16 s4, s4
-; VI-NEXT: s_sub_i32 s2, 0, s2
; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_max_i32 s3, s4, s3
-; VI-NEXT: s_max_i32 s2, s5, s2
-; VI-NEXT: s_add_i32 s3, s3, 2
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_and_b32 s3, s3, 0xffff
-; VI-NEXT: s_or_b32 s2, s2, s3
+; VI-NEXT: s_max_i32 s2, s4, s2
+; VI-NEXT: s_max_i32 s3, s5, s3
+; VI-NEXT: s_add_i32 s2, s2, 2
+; VI-NEXT: s_lshl_b32 s3, s3, 16
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: s_add_i32 s2, s2, 0x20000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -171,19 +171,19 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val)
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s2, s4, 16
-; VI-NEXT: s_sub_i32 s3, 0, s4
+; VI-NEXT: s_sub_i32 s2, 0, s4
+; VI-NEXT: s_lshr_b32 s3, s4, 16
; VI-NEXT: s_ashr_i32 s5, s4, 16
+; VI-NEXT: s_sub_i32 s3, 0, s3
+; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_sext_i32_i16 s4, s4
-; VI-NEXT: s_sub_i32 s2, 0, s2
; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_max_i32 s3, s4, s3
-; VI-NEXT: s_max_i32 s2, s5, s2
-; VI-NEXT: s_add_i32 s3, s3, 2
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_and_b32 s3, s3, 0xffff
-; VI-NEXT: s_or_b32 s2, s2, s3
+; VI-NEXT: s_max_i32 s2, s4, s2
+; VI-NEXT: s_max_i32 s3, s5, s3
+; VI-NEXT: s_add_i32 s2, s2, 2
+; VI-NEXT: s_lshl_b32 s3, s3, 16
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: s_add_i32 s2, s2, 0x20000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -331,31 +331,31 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s2, 16
-; VI-NEXT: s_lshr_b32 s5, s3, 16
-; VI-NEXT: s_sub_i32 s6, 0, s3
-; VI-NEXT: s_sub_i32 s7, 0, s2
-; VI-NEXT: s_sub_i32 s5, 0, s5
-; VI-NEXT: s_sub_i32 s4, 0, s4
+; VI-NEXT: s_lshr_b32 s7, s2, 16
+; VI-NEXT: s_sub_i32 s7, 0, s7
+; VI-NEXT: s_sub_i32 s4, 0, s3
+; VI-NEXT: s_lshr_b32 s6, s3, 16
; VI-NEXT: s_ashr_i32 s8, s2, 16
-; VI-NEXT: s_ashr_i32 s9, s3, 16
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_sext_i32_i16 s3, s3
; VI-NEXT: s_sext_i32_i16 s7, s7
-; VI-NEXT: s_sext_i32_i16 s6, s6
+; VI-NEXT: s_sub_i32 s5, 0, s2
+; VI-NEXT: s_sub_i32 s6, 0, s6
+; VI-NEXT: s_max_i32 s7, s8, s7
+; VI-NEXT: s_ashr_i32 s8, s3, 16
; VI-NEXT: s_sext_i32_i16 s4, s4
+; VI-NEXT: s_sext_i32_i16 s3, s3
+; VI-NEXT: s_sext_i32_i16 s6, s6
; VI-NEXT: s_sext_i32_i16 s5, s5
-; VI-NEXT: s_max_i32 s3, s3, s6
-; VI-NEXT: s_max_i32 s2, s2, s7
-; VI-NEXT: s_max_i32 s5, s9, s5
-; VI-NEXT: s_max_i32 s4, s8, s4
-; VI-NEXT: s_add_i32 s2, s2, 2
+; VI-NEXT: s_sext_i32_i16 s2, s2
+; VI-NEXT: s_max_i32 s3, s3, s4
+; VI-NEXT: s_max_i32 s6, s8, s6
+; VI-NEXT: s_max_i32 s2, s2, s5
; VI-NEXT: s_add_i32 s3, s3, 2
-; VI-NEXT: s_lshl_b32 s4, s4, 16
-; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: s_lshl_b32 s4, s6, 16
; VI-NEXT: s_and_b32 s3, s3, 0xffff
+; VI-NEXT: s_add_i32 s2, s2, 2
+; VI-NEXT: s_or_b32 s3, s4, s3
+; VI-NEXT: s_lshl_b32 s4, s7, 16
; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_or_b32 s3, s5, s3
; VI-NEXT: s_or_b32 s2, s4, s2
; VI-NEXT: s_add_i32 s3, s3, 0x20000
; VI-NEXT: s_add_i32 s2, s2, 0x20000
@@ -559,21 +559,21 @@ define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_ashr_i32 s2, s0, 16
-; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_ashr_i32 s3, s1, 16
+; VI-NEXT: s_ashr_i32 s2, s1, 16
+; VI-NEXT: s_ashr_i32 s3, s0, 16
; VI-NEXT: s_sext_i32_i16 s1, s1
+; VI-NEXT: s_sext_i32_i16 s0, s0
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_max_i32 s4, s2, s3
+; VI-NEXT: s_max_i32 s4, s3, s2
; VI-NEXT: s_max_i32 s5, s0, s1
; VI-NEXT: s_lshl_b32 s4, s4, 16
; VI-NEXT: s_and_b32 s5, s5, 0xffff
-; VI-NEXT: s_min_i32 s2, s2, s3
+; VI-NEXT: s_min_i32 s2, s3, s2
; VI-NEXT: s_min_i32 s0, s0, s1
; VI-NEXT: s_or_b32 s4, s5, s4
-; VI-NEXT: s_lshl_b32 s1, s2, 16
+; VI-NEXT: s_lshl_b32 s2, s2, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
@@ -661,12 +661,12 @@ define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_max_i32_sdwa v6, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; VI-NEXT: v_max_i32_sdwa v7, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_min_i32_sdwa v8, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; VI-NEXT: v_min_i32_sdwa v4, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_max_i32_sdwa v6, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_max_i32_sdwa v7, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-NEXT: v_min_i32_sdwa v8, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_min_i32_sdwa v4, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[2:3], v4
@@ -748,37 +748,37 @@ define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_ashr_i32 s0, s5, 16
-; VI-NEXT: s_ashr_i32 s1, s4, 16
-; VI-NEXT: s_sext_i32_i16 s2, s5
-; VI-NEXT: s_sext_i32_i16 s3, s4
-; VI-NEXT: s_ashr_i32 s4, s7, 16
-; VI-NEXT: s_ashr_i32 s5, s6, 16
-; VI-NEXT: s_sext_i32_i16 s7, s7
+; VI-NEXT: s_ashr_i32 s0, s7, 16
+; VI-NEXT: s_ashr_i32 s1, s5, 16
+; VI-NEXT: s_sext_i32_i16 s3, s7
+; VI-NEXT: s_sext_i32_i16 s5, s5
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_max_i32 s2, s1, s0
+; VI-NEXT: s_max_i32 s7, s5, s3
+; VI-NEXT: s_lshl_b32 s2, s2, 16
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_or_b32 s2, s7, s2
+; VI-NEXT: s_ashr_i32 s7, s6, 16
+; VI-NEXT: s_ashr_i32 s8, s4, 16
; VI-NEXT: s_sext_i32_i16 s6, s6
-; VI-NEXT: s_max_i32 s8, s1, s5
-; VI-NEXT: s_max_i32 s9, s0, s4
-; VI-NEXT: s_max_i32 s10, s3, s6
-; VI-NEXT: s_max_i32 s11, s2, s7
-; VI-NEXT: s_min_i32 s0, s0, s4
-; VI-NEXT: s_min_i32 s2, s2, s7
+; VI-NEXT: s_sext_i32_i16 s4, s4
+; VI-NEXT: s_min_i32 s0, s1, s0
+; VI-NEXT: s_min_i32 s1, s5, s3
+; VI-NEXT: s_max_i32 s9, s8, s7
+; VI-NEXT: s_max_i32 s10, s4, s6
+; VI-NEXT: s_lshl_b32 s0, s0, 16
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
; VI-NEXT: s_lshl_b32 s9, s9, 16
-; VI-NEXT: s_and_b32 s11, s11, 0xffff
-; VI-NEXT: s_lshl_b32 s8, s8, 16
; VI-NEXT: s_and_b32 s10, s10, 0xffff
-; VI-NEXT: s_min_i32 s1, s1, s5
-; VI-NEXT: s_min_i32 s3, s3, s6
-; VI-NEXT: s_lshl_b32 s0, s0, 16
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_or_b32 s9, s11, s9
-; VI-NEXT: s_or_b32 s8, s10, s8
-; VI-NEXT: s_or_b32 s0, s2, s0
+; VI-NEXT: v_mov_b32_e32 v5, s2
+; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_min_i32 s1, s8, s7
+; VI-NEXT: s_min_i32 s2, s4, s6
+; VI-NEXT: s_or_b32 s9, s10, s9
; VI-NEXT: s_lshl_b32 s1, s1, 16
-; VI-NEXT: s_and_b32 s2, s3, 0xffff
-; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: s_or_b32 s1, s2, s1
; VI-NEXT: v_mov_b32_e32 v6, s1
; VI-NEXT: v_mov_b32_e32 v7, s0
@@ -899,42 +899,34 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_readfirstlane_b32 s0, v4
-; VI-NEXT: v_readfirstlane_b32 s1, v5
-; VI-NEXT: s_ashr_i32 s3, s0, 16
-; VI-NEXT: s_ashr_i32 s5, s1, 16
-; VI-NEXT: s_cmp_gt_i32 s3, s5
-; VI-NEXT: s_sext_i32_i16 s2, s0
-; VI-NEXT: s_sext_i32_i16 s4, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT: s_cselect_b32 s0, s3, s5
-; VI-NEXT: s_cselect_b32 s3, s5, s3
-; VI-NEXT: s_lshl_b32 s5, s0, 16
-; VI-NEXT: s_cmp_gt_i32 s2, s4
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT: s_cselect_b32 s0, s2, s4
-; VI-NEXT: s_cselect_b32 s1, s4, s2
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v4
-; VI-NEXT: s_lshl_b32 s2, s3, 16
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_or_b32 s0, s0, s5
-; VI-NEXT: v_or_b32_e32 v4, v5, v4
-; VI-NEXT: s_or_b32 s1, s1, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: v_and_b32_e32 v4, 3, v4
-; VI-NEXT: v_mov_b32_e32 v6, s1
-; VI-NEXT: flat_store_dword v[0:1], v5
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_ashrrev_i32_e32 v10, 16, v4
+; VI-NEXT: v_ashrrev_i32_e32 v11, 16, v5
+; VI-NEXT: v_bfe_i32 v6, v4, 0, 16
+; VI-NEXT: v_bfe_i32 v7, v5, 0, 16
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v5
+; VI-NEXT: v_cmp_gt_i32_e32 vcc, v10, v11
+; VI-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], v6, v7
+; VI-NEXT: v_cndmask_b32_e64 v6, v5, v4, s[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; VI-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1]
+; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v5, 1, v5
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; VI-NEXT: flat_store_dword v[0:1], v6
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: flat_store_dword v[2:3], v6
+; VI-NEXT: v_or_b32_e32 v0, v9, v5
+; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 3, v0
+; VI-NEXT: flat_store_dword v[2:3], v4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: flat_store_byte v[0:1], v4
+; VI-NEXT: flat_store_byte v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
@@ -1021,19 +1013,19 @@ define amdgpu_kernel void @u_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshr_b32 s2, s0, 16
-; VI-NEXT: s_lshr_b32 s3, s1, 16
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_max_u32 s5, s2, s3
+; VI-NEXT: s_and_b32 s2, s1, 0xffff
+; VI-NEXT: s_and_b32 s3, s0, 0xffff
+; VI-NEXT: s_lshr_b32 s1, s1, 16
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: s_max_u32 s5, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_max_u32 s4, s0, s1
+; VI-NEXT: s_max_u32 s4, s3, s2
; VI-NEXT: s_lshl_b32 s5, s5, 16
; VI-NEXT: s_min_u32 s0, s0, s1
-; VI-NEXT: s_min_u32 s1, s2, s3
; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_lshl_b32 s1, s1, 16
-; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_min_u32 s2, s3, s2
+; VI-NEXT: s_lshl_b32 s0, s0, 16
+; VI-NEXT: s_or_b32 s0, s2, s0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index cd06a060a50cd8..1540c3e5c403fd 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -187,15 +187,14 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s0, v1
-; VI-NEXT: v_readfirstlane_b32 s1, v0
-; VI-NEXT: s_ashr_i32 s2, s1, 16
-; VI-NEXT: s_sext_i32_i16 s1, s1
+; VI-NEXT: v_readfirstlane_b32 s0, v0
+; VI-NEXT: v_readfirstlane_b32 s1, v1
+; VI-NEXT: s_lshr_b32 s2, s1, 16
; VI-NEXT: s_ashr_i32 s3, s0, 16
; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_ashr_i32 s0, s1, s0
-; VI-NEXT: s_ashr_i32 s1, s2, s3
-; VI-NEXT: s_lshl_b32 s1, s1, 16
+; VI-NEXT: s_ashr_i32 s2, s3, s2
+; VI-NEXT: s_ashr_i32 s0, s0, s1
+; VI-NEXT: s_lshl_b32 s1, s2, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -282,43 +281,41 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s0, v2
-; VI-NEXT: v_readfirstlane_b32 s1, v3
-; VI-NEXT: v_readfirstlane_b32 s2, v0
-; VI-NEXT: v_readfirstlane_b32 s3, v1
-; VI-NEXT: s_ashr_i32 s8, s3, 16
-; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: s_ashr_i32 s9, s2, 16
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_ashr_i32 s10, s1, 16
-; VI-NEXT: s_sext_i32_i16 s1, s1
-; VI-NEXT: s_ashr_i32 s11, s0, 16
-; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_ashr_i32 s0, s2, s0
-; VI-NEXT: s_ashr_i32 s2, s9, s11
-; VI-NEXT: s_ashr_i32 s1, s3, s1
-; VI-NEXT: s_ashr_i32 s3, s8, s10
-; VI-NEXT: s_lshl_b32 s3, s3, 16
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_or_b32 s1, s1, s3
-; VI-NEXT: s_or_b32 s0, s0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_readfirstlane_b32 s4, v0
+; VI-NEXT: v_readfirstlane_b32 s5, v2
+; VI-NEXT: v_readfirstlane_b32 s6, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v3
+; VI-NEXT: s_lshr_b32 s8, s7, 16
+; VI-NEXT: s_ashr_i32 s9, s6, 16
+; VI-NEXT: s_sext_i32_i16 s6, s6
+; VI-NEXT: s_lshr_b32 s10, s5, 16
+; VI-NEXT: s_ashr_i32 s11, s4, 16
+; VI-NEXT: s_sext_i32_i16 s4, s4
+; VI-NEXT: s_ashr_i32 s8, s9, s8
+; VI-NEXT: s_ashr_i32 s6, s6, s7
+; VI-NEXT: s_ashr_i32 s7, s11, s10
+; VI-NEXT: s_ashr_i32 s4, s4, s5
+; VI-NEXT: s_lshl_b32 s5, s8, 16
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_or_b32 s5, s6, s5
+; VI-NEXT: s_or_b32 s4, s4, s7
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 5a821db6ff0408..327a85e80da9dc 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -117,23 +117,21 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s6, s[6:7], 0x0
-; VI-NEXT: s_load_dword s7, s[0:1], 0x0
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_load_dword s2, s[6:7], 0x0
+; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s6, 16
-; VI-NEXT: s_lshr_b32 s5, s7, 16
-; VI-NEXT: s_sub_i32 s6, s6, s7
-; VI-NEXT: s_sub_i32 s4, s4, s5
-; VI-NEXT: s_and_b32 s5, s6, 0xffff
-; VI-NEXT: s_lshl_b32 s4, s4, 16
-; VI-NEXT: s_or_b32 s4, s5, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_sub_i32 s1, s2, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: s_lshr_b32 s2, s2, 16
+; VI-NEXT: s_sub_i32 s0, s2, s0
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_lshl_b32 s0, s0, 16
+; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_sub_v2i16:
@@ -235,9 +233,9 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: s_lshr_b32 s1, s3, 16
-; VI-NEXT: s_sub_i32 s0, s0, s1
+; VI-NEXT: s_lshr_b32 s0, s3, 16
+; VI-NEXT: s_lshr_b32 s1, s2, 16
+; VI-NEXT: s_sub_i32 s0, s1, s0
; VI-NEXT: s_sub_i32 s1, s2, s3
; VI-NEXT: s_lshl_b32 s0, s0, 16
; VI-NEXT: s_and_b32 s1, s1, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index ee99fcc5863340..6195dd1992469b 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -92,10 +92,11 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0
-; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0
-; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1
+; SDAG-VI-NEXT: s_sext_i32_i16 s2, s2
+; SDAG-VI-NEXT: s_sext_i32_i16 s3, s3
+; SDAG-VI-NEXT: v_med3_i32 v1, s2, 0, v0
+; SDAG-VI-NEXT: v_med3_i32 v0, s3, 0, v0
+; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
@@ -417,12 +418,12 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: s_lshr_b32 s2, s4, 16
-; SDAG-VI-NEXT: v_max_i16_e64 v1, s4, 0
-; SDAG-VI-NEXT: v_max_i16_e64 v2, s2, 0
-; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1
-; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0
+; SDAG-VI-NEXT: s_ashr_i32 s2, s4, 16
+; SDAG-VI-NEXT: s_sext_i32_i16 s3, s4
+; SDAG-VI-NEXT: v_med3_i32 v1, s3, 0, v0
+; SDAG-VI-NEXT: v_med3_i32 v0, s2, 0, v0
+; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 7a1f05f56a7517..f074f7bf67f770 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -323,7 +323,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s1, s0, 0xffffff00
+; VI-NEXT: s_and_b32 s1, s0, 0xff00
; VI-NEXT: s_add_i32 s0, s0, 12
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: s_and_b32 s0, s0, 0xff
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index af50e09f509a3b..ded6f5cda33b9a 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX8,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX9,GCN %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
; R600: {{^}}s_mad_zext_i32_to_i64:
@@ -53,7 +53,8 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i
; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}}
; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}}
-; GCN: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]]
+; GFX8: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]]
+; GFX9: s_cmp_eq_u32 [[MASK_B]], [[MASK_A]]
; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN: buffer_store_short [[RESULT]]