[llvm] [AMDGPU] Disable i16 -> i32 promotion in CGP for DAGISel (PR #112334)

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 15 01:42:25 PDT 2024


https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/112334

Only promote in CGP if GlobalISel is enabled, or if the CL option is specifically used.

>From d05a3222b5637cb3a8cec49acf882b6b7fdd76a4 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 15 Oct 2024 10:40:13 +0200
Subject: [PATCH] [AMDGPU] Disable i16 -> i32 promotion in CGP for DAGISel

Only promote in CGP if GlobalISel is enabled, or if the CL option is specifically used.
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |   7 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  35 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |   1 +
 llvm/test/CodeGen/AMDGPU/add.v2i16.ll         |  33 +-
 llvm/test/CodeGen/AMDGPU/anyext.ll            |   8 +-
 .../CodeGen/AMDGPU/calling-conventions.ll     | 413 +++++++++---------
 .../CodeGen/AMDGPU/cgp-bitfield-extract.ll    |   9 +-
 llvm/test/CodeGen/AMDGPU/ctlz.ll              |  21 +-
 llvm/test/CodeGen/AMDGPU/cttz.ll              |  12 +-
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   |  20 +-
 llvm/test/CodeGen/AMDGPU/fneg.ll              |  13 +-
 llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll        |  14 +-
 llvm/test/CodeGen/AMDGPU/min.ll               | 369 ++++++++--------
 llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll     |  19 +-
 llvm/test/CodeGen/AMDGPU/sext-in-reg.ll       |  15 +-
 llvm/test/CodeGen/AMDGPU/shl.v2i16.ll         |   6 +-
 llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll     | 230 +++++-----
 llvm/test/CodeGen/AMDGPU/sra.ll               |  79 ++--
 llvm/test/CodeGen/AMDGPU/sub.v2i16.ll         |  34 +-
 llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll   |  21 +-
 llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll  |   2 +-
 llvm/test/CodeGen/AMDGPU/zero_extend.ll       |   7 +-
 22 files changed, 670 insertions(+), 698 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 7d3164c79089e0..4e3fb628e7ea7a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -401,7 +401,12 @@ bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
 }
 
 bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
-  if (!Widen16BitOps)
+  // If the widening option was set explicitly, honor it; otherwise widen
+  // i16 -> i32 only when GlobalISel is enabled.
+  if (Widen16BitOps.getNumOccurrences()) {
+    if (!Widen16BitOps)
+      return false;
+  } else if (!getCGPassBuilderOption().EnableGlobalISelOption)
     return false;
 
   const IntegerType *IntTy = dyn_cast<IntegerType>(T);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8c197f23149612..c7bae4dd05fd34 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6822,6 +6822,22 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
   }
 }
 
+bool SITargetLowering::shouldPromoteUniformOpToI32(SDValue Op, EVT ExtTy, EVT OpTy) const {
+  switch(Op.getOpcode()) {
+    case ISD::SMIN:
+    case ISD::SMAX:
+    case ISD::UMIN:
+    case ISD::UMAX: {
+      if (!Subtarget->has16BitInsts() || OpTy.isVector())
+        return false;
+      unsigned Size = OpTy.getSizeInBits();
+      return !Op->isDivergent() && Size >= 2 && Size <= 16;
+    }
+    default:
+      return !isNarrowingProfitable(Op.getNode(), ExtTy, OpTy);
+  }
+}
+
 SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
                                                 DAGCombinerInfo &DCI) const {
   const unsigned Opc = Op.getOpcode();
@@ -6836,7 +6852,7 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
   auto ExtTy = OpTy.changeElementType(MVT::i32);
 
   if (DCI.isBeforeLegalizeOps() ||
-      isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
+      !shouldPromoteUniformOpToI32(Op, ExtTy, OpTy))
     return SDValue();
 
   auto &DAG = DCI.DAG;
@@ -6852,14 +6868,25 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
     RHS = Op->getOperand(1);
   }
 
-  const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
-  LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
+  // For SIGN_EXTEND, check for an existing SIGN_EXTEND_INREG and reproduce
+  // that instead, as it leads to better code generation.
+  // TODO: DAGCombiner should take care of that but the combine doesn't apply all the time.
+  const auto ExtendOp = [&, ExtOp = getExtOpcodeForPromotedOp(Op)](SDValue Op){
+    if (ExtOp == ISD::SIGN_EXTEND && Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+      SDValue InRegSrc = DAG.getNode(ISD::ANY_EXTEND, DL, ExtTy, Op.getOperand(0));
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtTy, {InRegSrc, Op->getOperand(1)});
+    }
+
+    return DAG.getNode(ExtOp, DL, ExtTy, {Op});
+  };
+
+  LHS = ExtendOp(LHS);
 
   // Special case: for shifts, the RHS always needs a zext.
   if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
     RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
   else
-    RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
+    RHS = ExtendOp(RHS);
 
   // setcc always return i1/i1 vec so no need to truncate after.
   if (Opc == ISD::SETCC) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 6c3edf37945e24..4fdecd8252125a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -147,6 +147,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
+  bool shouldPromoteUniformOpToI32(SDValue Op, EVT ExtTy, EVT OpTy) const;
   SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
   SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 4cc384e9d27188..bcd75255acef44 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -103,13 +103,13 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s1, s2, 16
-; VI-NEXT:    s_lshr_b32 s3, s0, 16
+; VI-NEXT:    s_add_i32 s1, s2, s0
+; VI-NEXT:    s_lshr_b32 s0, s0, 16
+; VI-NEXT:    s_lshr_b32 s2, s2, 16
 ; VI-NEXT:    s_add_i32 s2, s2, s0
-; VI-NEXT:    s_add_i32 s1, s1, s3
-; VI-NEXT:    s_and_b32 s0, s2, 0xffff
-; VI-NEXT:    s_lshl_b32 s1, s1, 16
-; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s0, s2, 16
+; VI-NEXT:    s_or_b32 s0, s1, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -170,16 +170,15 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s0, s2, 16
-; VI-NEXT:    s_and_b32 s1, s2, 0xffff
+; VI-NEXT:    s_lshr_b32 s1, s2, 16
+; VI-NEXT:    s_add_i32 s0, s2, s2
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_lshl_b32 s0, s0, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_lshl_b32 s1, s1, 16
+; VI-NEXT:    s_or_b32 s0, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -230,12 +229,12 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
+; VI-NEXT:    s_lshr_b32 s4, s3, 16
+; VI-NEXT:    s_lshr_b32 s5, s2, 16
 ; VI-NEXT:    s_add_i32 s2, s2, s3
-; VI-NEXT:    s_add_i32 s4, s4, s5
+; VI-NEXT:    s_add_i32 s5, s5, s4
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_lshl_b32 s3, s4, 16
+; VI-NEXT:    s_lshl_b32 s3, s5, 16
 ; VI-NEXT:    s_or_b32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 8b6c8be9f37882..115cb40676da8c 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -27,11 +27,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX8-NEXT:    v_not_b32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -42,11 +40,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 29770738f83d57..e9ddc801b050c5 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -980,7 +980,6 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
 ;
 ; VI-LABEL: ps_mesa_inreg_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_and_b32 s0, 0xffff, s0
 ; VI-NEXT:    s_add_i32 s0, s0, s0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_short v[0:1], v0
@@ -988,9 +987,8 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
 ;
 ; GFX11-LABEL: ps_mesa_inreg_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_i32 s0, s0, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
@@ -1140,20 +1138,20 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s0, 16
-; VI-NEXT:    s_lshr_b32 s1, s0, 24
-; VI-NEXT:    s_add_i32 s2, s2, s2
+; VI-NEXT:    s_lshr_b32 s1, s0, 16
+; VI-NEXT:    s_lshr_b32 s2, s0, 24
 ; VI-NEXT:    s_bfe_u32 s3, s0, 0x80008
+; VI-NEXT:    s_add_i32 s2, s2, s2
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    s_and_b32 s2, s2, 0xff
-; VI-NEXT:    s_add_i32 s3, s3, s3
+; VI-NEXT:    s_lshl_b32 s2, s2, 8
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_lshl_b32 s1, s1, 24
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
+; VI-NEXT:    s_add_i32 s3, s3, s3
 ; VI-NEXT:    s_or_b32 s1, s1, s2
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
 ; VI-NEXT:    s_lshl_b32 s2, s3, 8
 ; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    s_lshl_b32 s1, s1, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; VI-NEXT:    s_or_b32 s0, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -1227,8 +1225,8 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    s_bfe_u32 s2, s0, 0x80008
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_add_i32 s2, s2, s2
+; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
 ; VI-NEXT:    s_lshl_b32 s2, s2, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s1
@@ -1308,22 +1306,21 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ; VI-NEXT:    v_mov_b32_e32 v0, 4
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s3, s0, 16
-; VI-NEXT:    s_lshr_b32 s2, s0, 24
-; VI-NEXT:    s_add_i32 s3, s3, s3
+; VI-NEXT:    s_lshr_b32 s2, s0, 16
+; VI-NEXT:    s_lshr_b32 s3, s0, 24
 ; VI-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; VI-NEXT:    s_add_i32 s3, s3, s3
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    s_and_b32 s3, s3, 0xff
-; VI-NEXT:    s_add_i32 s4, s4, s4
+; VI-NEXT:    s_lshl_b32 s3, s3, 8
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_lshl_b32 s2, s2, 24
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xff
+; VI-NEXT:    s_add_i32 s4, s4, s4
 ; VI-NEXT:    s_or_b32 s2, s2, s3
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
 ; VI-NEXT:    s_lshl_b32 s3, s4, 8
-; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_or_b32 s0, s0, s3
+; VI-NEXT:    s_add_i32 s1, s1, s1
+; VI-NEXT:    s_lshl_b32 s2, s2, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; VI-NEXT:    v_mov_b32_e32 v2, s1
 ; VI-NEXT:    s_or_b32 s0, s0, s2
@@ -1423,37 +1420,37 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s3, s1, 16
-; VI-NEXT:    s_lshr_b32 s2, s1, 24
+; VI-NEXT:    s_lshr_b32 s2, s0, 16
+; VI-NEXT:    s_lshr_b32 s3, s0, 24
+; VI-NEXT:    s_lshr_b32 s4, s1, 16
+; VI-NEXT:    s_lshr_b32 s5, s1, 24
+; VI-NEXT:    s_bfe_u32 s6, s0, 0x80008
+; VI-NEXT:    s_bfe_u32 s7, s1, 0x80008
+; VI-NEXT:    s_add_i32 s5, s5, s5
+; VI-NEXT:    s_add_i32 s4, s4, s4
 ; VI-NEXT:    s_add_i32 s3, s3, s3
-; VI-NEXT:    s_bfe_u32 s6, s1, 0x80008
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    s_and_b32 s3, s3, 0xff
-; VI-NEXT:    s_add_i32 s6, s6, s6
+; VI-NEXT:    s_lshl_b32 s5, s5, 8
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    s_lshl_b32 s2, s2, 24
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    s_lshr_b32 s5, s0, 16
-; VI-NEXT:    s_or_b32 s2, s2, s3
-; VI-NEXT:    s_and_b32 s1, s1, 0xff
-; VI-NEXT:    s_lshl_b32 s3, s6, 8
-; VI-NEXT:    s_lshr_b32 s4, s0, 24
-; VI-NEXT:    s_add_i32 s5, s5, s5
-; VI-NEXT:    s_or_b32 s1, s1, s3
-; VI-NEXT:    s_bfe_u32 s7, s0, 0x80008
-; VI-NEXT:    s_add_i32 s4, s4, s4
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_and_b32 s3, s5, 0xff
 ; VI-NEXT:    s_add_i32 s7, s7, s7
+; VI-NEXT:    s_lshl_b32 s3, s3, 8
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_or_b32 s1, s1, s2
-; VI-NEXT:    s_lshl_b32 s2, s4, 24
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
+; VI-NEXT:    s_add_i32 s6, s6, s6
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
+; VI-NEXT:    s_lshl_b32 s5, s7, 8
 ; VI-NEXT:    s_or_b32 s2, s2, s3
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
-; VI-NEXT:    s_lshl_b32 s3, s7, 8
+; VI-NEXT:    s_lshl_b32 s3, s6, 8
+; VI-NEXT:    s_or_b32 s1, s1, s5
 ; VI-NEXT:    s_or_b32 s0, s0, s3
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s2, s2, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s1, s1, s4
 ; VI-NEXT:    s_or_b32 s0, s0, s2
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1595,69 +1592,69 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
 ; VI-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    s_lshr_b32 s4, s3, 24
+; VI-NEXT:    s_lshr_b32 s4, s0, 16
+; VI-NEXT:    s_lshr_b32 s5, s0, 24
+; VI-NEXT:    s_lshr_b32 s6, s1, 16
+; VI-NEXT:    s_lshr_b32 s7, s1, 24
+; VI-NEXT:    s_lshr_b32 s8, s2, 16
+; VI-NEXT:    s_lshr_b32 s9, s2, 24
+; VI-NEXT:    s_lshr_b32 s10, s3, 16
+; VI-NEXT:    s_lshr_b32 s11, s3, 24
+; VI-NEXT:    s_bfe_u32 s12, s0, 0x80008
+; VI-NEXT:    s_bfe_u32 s13, s1, 0x80008
+; VI-NEXT:    s_bfe_u32 s14, s2, 0x80008
+; VI-NEXT:    s_bfe_u32 s15, s3, 0x80008
+; VI-NEXT:    s_add_i32 s11, s11, s11
+; VI-NEXT:    s_add_i32 s10, s10, s10
+; VI-NEXT:    s_add_i32 s9, s9, s9
+; VI-NEXT:    s_add_i32 s8, s8, s8
+; VI-NEXT:    s_add_i32 s7, s7, s7
+; VI-NEXT:    s_add_i32 s6, s6, s6
 ; VI-NEXT:    s_add_i32 s5, s5, s5
-; VI-NEXT:    s_bfe_u32 s12, s3, 0x80008
 ; VI-NEXT:    s_add_i32 s4, s4, s4
-; VI-NEXT:    s_and_b32 s5, s5, 0xff
-; VI-NEXT:    s_add_i32 s12, s12, s12
+; VI-NEXT:    s_lshl_b32 s11, s11, 8
+; VI-NEXT:    s_and_b32 s10, s10, 0xff
 ; VI-NEXT:    s_add_i32 s3, s3, s3
-; VI-NEXT:    s_lshl_b32 s4, s4, 24
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
-; VI-NEXT:    s_lshr_b32 s7, s2, 16
-; VI-NEXT:    s_or_b32 s4, s4, s5
-; VI-NEXT:    s_and_b32 s3, s3, 0xff
-; VI-NEXT:    s_lshl_b32 s5, s12, 8
-; VI-NEXT:    s_lshr_b32 s6, s2, 24
-; VI-NEXT:    s_add_i32 s7, s7, s7
-; VI-NEXT:    s_or_b32 s3, s3, s5
-; VI-NEXT:    s_bfe_u32 s13, s2, 0x80008
-; VI-NEXT:    s_add_i32 s6, s6, s6
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_and_b32 s5, s7, 0xff
-; VI-NEXT:    s_add_i32 s13, s13, s13
+; VI-NEXT:    s_add_i32 s15, s15, s15
+; VI-NEXT:    s_lshl_b32 s9, s9, 8
+; VI-NEXT:    s_and_b32 s8, s8, 0xff
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    s_or_b32 s3, s3, s4
-; VI-NEXT:    s_lshl_b32 s4, s6, 24
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
-; VI-NEXT:    s_lshr_b32 s9, s1, 16
-; VI-NEXT:    s_or_b32 s4, s4, s5
-; VI-NEXT:    s_and_b32 s2, s2, 0xff
-; VI-NEXT:    s_lshl_b32 s5, s13, 8
-; VI-NEXT:    s_lshr_b32 s8, s1, 24
-; VI-NEXT:    s_add_i32 s9, s9, s9
-; VI-NEXT:    s_or_b32 s2, s2, s5
-; VI-NEXT:    s_bfe_u32 s14, s1, 0x80008
-; VI-NEXT:    s_add_i32 s8, s8, s8
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_and_b32 s5, s9, 0xff
 ; VI-NEXT:    s_add_i32 s14, s14, s14
+; VI-NEXT:    s_lshl_b32 s7, s7, 8
+; VI-NEXT:    s_and_b32 s6, s6, 0xff
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    s_or_b32 s2, s2, s4
-; VI-NEXT:    s_lshl_b32 s4, s8, 24
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
-; VI-NEXT:    s_lshr_b32 s11, s0, 16
-; VI-NEXT:    s_or_b32 s4, s4, s5
-; VI-NEXT:    s_and_b32 s1, s1, 0xff
-; VI-NEXT:    s_lshl_b32 s5, s14, 8
-; VI-NEXT:    s_lshr_b32 s10, s0, 24
-; VI-NEXT:    s_add_i32 s11, s11, s11
-; VI-NEXT:    s_or_b32 s1, s1, s5
-; VI-NEXT:    s_bfe_u32 s15, s0, 0x80008
-; VI-NEXT:    s_add_i32 s10, s10, s10
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_and_b32 s5, s11, 0xff
-; VI-NEXT:    s_add_i32 s15, s15, s15
+; VI-NEXT:    s_add_i32 s13, s13, s13
+; VI-NEXT:    s_lshl_b32 s5, s5, 8
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_or_b32 s1, s1, s4
-; VI-NEXT:    s_lshl_b32 s4, s10, 24
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_add_i32 s12, s12, s12
+; VI-NEXT:    s_or_b32 s10, s10, s11
+; VI-NEXT:    s_and_b32 s3, s3, 0xff
+; VI-NEXT:    s_lshl_b32 s11, s15, 8
+; VI-NEXT:    s_or_b32 s8, s8, s9
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
+; VI-NEXT:    s_lshl_b32 s9, s14, 8
+; VI-NEXT:    s_or_b32 s6, s6, s7
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
+; VI-NEXT:    s_lshl_b32 s7, s13, 8
 ; VI-NEXT:    s_or_b32 s4, s4, s5
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
-; VI-NEXT:    s_lshl_b32 s5, s15, 8
+; VI-NEXT:    s_lshl_b32 s5, s12, 8
+; VI-NEXT:    s_or_b32 s3, s3, s11
+; VI-NEXT:    s_or_b32 s2, s2, s9
+; VI-NEXT:    s_or_b32 s1, s1, s7
 ; VI-NEXT:    s_or_b32 s0, s0, s5
+; VI-NEXT:    s_lshl_b32 s10, s10, 16
+; VI-NEXT:    s_and_b32 s3, s3, 0xffff
+; VI-NEXT:    s_lshl_b32 s8, s8, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_lshl_b32 s6, s6, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s3, s3, s10
+; VI-NEXT:    s_or_b32 s2, s2, s8
+; VI-NEXT:    s_or_b32 s1, s1, s6
 ; VI-NEXT:    s_or_b32 s0, s0, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1904,138 +1901,138 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; VI-NEXT:    v_mov_b32_e32 v4, 16
 ; VI-NEXT:    v_mov_b32_e32 v5, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s9, s3, 16
-; VI-NEXT:    s_lshr_b32 s8, s3, 24
+; VI-NEXT:    s_lshr_b32 s8, s4, 16
+; VI-NEXT:    s_lshr_b32 s9, s4, 24
+; VI-NEXT:    s_lshr_b32 s10, s5, 16
+; VI-NEXT:    s_lshr_b32 s11, s5, 24
+; VI-NEXT:    s_lshr_b32 s12, s6, 16
+; VI-NEXT:    s_lshr_b32 s13, s6, 24
+; VI-NEXT:    s_lshr_b32 s14, s7, 16
+; VI-NEXT:    s_lshr_b32 s15, s7, 24
+; VI-NEXT:    s_bfe_u32 s24, s4, 0x80008
+; VI-NEXT:    s_bfe_u32 s25, s5, 0x80008
+; VI-NEXT:    s_bfe_u32 s26, s6, 0x80008
+; VI-NEXT:    s_bfe_u32 s27, s7, 0x80008
+; VI-NEXT:    s_add_i32 s15, s15, s15
+; VI-NEXT:    s_add_i32 s14, s14, s14
+; VI-NEXT:    s_add_i32 s13, s13, s13
+; VI-NEXT:    s_add_i32 s12, s12, s12
+; VI-NEXT:    s_add_i32 s11, s11, s11
+; VI-NEXT:    s_add_i32 s10, s10, s10
 ; VI-NEXT:    s_add_i32 s9, s9, s9
-; VI-NEXT:    s_bfe_u32 s24, s3, 0x80008
 ; VI-NEXT:    s_add_i32 s8, s8, s8
-; VI-NEXT:    s_and_b32 s9, s9, 0xff
+; VI-NEXT:    s_lshr_b32 s16, s0, 16
+; VI-NEXT:    s_lshr_b32 s17, s0, 24
+; VI-NEXT:    s_lshr_b32 s18, s1, 16
+; VI-NEXT:    s_lshr_b32 s19, s1, 24
+; VI-NEXT:    s_lshr_b32 s20, s2, 16
+; VI-NEXT:    s_lshr_b32 s21, s2, 24
+; VI-NEXT:    s_lshr_b32 s22, s3, 16
+; VI-NEXT:    s_lshr_b32 s23, s3, 24
+; VI-NEXT:    s_lshl_b32 s15, s15, 8
+; VI-NEXT:    s_and_b32 s14, s14, 0xff
+; VI-NEXT:    s_add_i32 s7, s7, s7
+; VI-NEXT:    s_add_i32 s27, s27, s27
+; VI-NEXT:    s_lshl_b32 s13, s13, 8
+; VI-NEXT:    s_and_b32 s12, s12, 0xff
+; VI-NEXT:    s_add_i32 s6, s6, s6
+; VI-NEXT:    s_add_i32 s26, s26, s26
+; VI-NEXT:    s_lshl_b32 s11, s11, 8
+; VI-NEXT:    s_and_b32 s10, s10, 0xff
+; VI-NEXT:    s_add_i32 s5, s5, s5
+; VI-NEXT:    s_add_i32 s25, s25, s25
+; VI-NEXT:    s_lshl_b32 s9, s9, 8
+; VI-NEXT:    s_and_b32 s8, s8, 0xff
+; VI-NEXT:    s_add_i32 s4, s4, s4
 ; VI-NEXT:    s_add_i32 s24, s24, s24
-; VI-NEXT:    s_add_i32 s3, s3, s3
-; VI-NEXT:    s_lshl_b32 s8, s8, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s11, s2, 16
+; VI-NEXT:    s_bfe_u32 s28, s0, 0x80008
+; VI-NEXT:    s_bfe_u32 s29, s1, 0x80008
+; VI-NEXT:    s_bfe_u32 s30, s2, 0x80008
+; VI-NEXT:    s_bfe_u32 s31, s3, 0x80008
+; VI-NEXT:    s_add_i32 s23, s23, s23
+; VI-NEXT:    s_add_i32 s22, s22, s22
+; VI-NEXT:    s_add_i32 s21, s21, s21
+; VI-NEXT:    s_add_i32 s20, s20, s20
+; VI-NEXT:    s_add_i32 s19, s19, s19
+; VI-NEXT:    s_add_i32 s18, s18, s18
+; VI-NEXT:    s_add_i32 s17, s17, s17
+; VI-NEXT:    s_add_i32 s16, s16, s16
+; VI-NEXT:    s_or_b32 s14, s14, s15
+; VI-NEXT:    s_and_b32 s7, s7, 0xff
+; VI-NEXT:    s_lshl_b32 s15, s27, 8
+; VI-NEXT:    s_or_b32 s12, s12, s13
+; VI-NEXT:    s_and_b32 s6, s6, 0xff
+; VI-NEXT:    s_lshl_b32 s13, s26, 8
+; VI-NEXT:    s_or_b32 s10, s10, s11
+; VI-NEXT:    s_and_b32 s5, s5, 0xff
+; VI-NEXT:    s_lshl_b32 s11, s25, 8
 ; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s3, s3, 0xff
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
 ; VI-NEXT:    s_lshl_b32 s9, s24, 8
-; VI-NEXT:    s_lshr_b32 s10, s2, 24
-; VI-NEXT:    s_add_i32 s11, s11, s11
-; VI-NEXT:    s_or_b32 s3, s3, s9
-; VI-NEXT:    s_bfe_u32 s25, s2, 0x80008
-; VI-NEXT:    s_add_i32 s10, s10, s10
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_and_b32 s9, s11, 0xff
-; VI-NEXT:    s_add_i32 s25, s25, s25
+; VI-NEXT:    s_lshl_b32 s23, s23, 8
+; VI-NEXT:    s_and_b32 s22, s22, 0xff
+; VI-NEXT:    s_add_i32 s3, s3, s3
+; VI-NEXT:    s_add_i32 s31, s31, s31
+; VI-NEXT:    s_lshl_b32 s21, s21, 8
+; VI-NEXT:    s_and_b32 s20, s20, 0xff
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    s_or_b32 s3, s3, s8
-; VI-NEXT:    s_lshl_b32 s8, s10, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s13, s1, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s2, s2, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s25, 8
-; VI-NEXT:    s_lshr_b32 s12, s1, 24
-; VI-NEXT:    s_add_i32 s13, s13, s13
-; VI-NEXT:    s_or_b32 s2, s2, s9
-; VI-NEXT:    s_bfe_u32 s26, s1, 0x80008
-; VI-NEXT:    s_add_i32 s12, s12, s12
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_and_b32 s9, s13, 0xff
-; VI-NEXT:    s_add_i32 s26, s26, s26
+; VI-NEXT:    s_add_i32 s30, s30, s30
+; VI-NEXT:    s_lshl_b32 s19, s19, 8
+; VI-NEXT:    s_and_b32 s18, s18, 0xff
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    s_or_b32 s2, s2, s8
-; VI-NEXT:    s_lshl_b32 s8, s12, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s15, s0, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s1, s1, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s26, 8
-; VI-NEXT:    s_lshr_b32 s14, s0, 24
-; VI-NEXT:    s_add_i32 s15, s15, s15
-; VI-NEXT:    s_or_b32 s1, s1, s9
-; VI-NEXT:    s_bfe_u32 s27, s0, 0x80008
-; VI-NEXT:    s_add_i32 s14, s14, s14
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_and_b32 s9, s15, 0xff
-; VI-NEXT:    s_add_i32 s27, s27, s27
+; VI-NEXT:    s_add_i32 s29, s29, s29
+; VI-NEXT:    s_lshl_b32 s17, s17, 8
+; VI-NEXT:    s_and_b32 s16, s16, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    s_or_b32 s1, s1, s8
-; VI-NEXT:    s_lshl_b32 s8, s14, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s17, s7, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s0, s0, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s27, 8
-; VI-NEXT:    s_lshr_b32 s16, s7, 24
-; VI-NEXT:    s_add_i32 s17, s17, s17
-; VI-NEXT:    s_or_b32 s0, s0, s9
-; VI-NEXT:    s_bfe_u32 s28, s7, 0x80008
-; VI-NEXT:    s_add_i32 s16, s16, s16
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_and_b32 s9, s17, 0xff
 ; VI-NEXT:    s_add_i32 s28, s28, s28
-; VI-NEXT:    s_add_i32 s7, s7, s7
-; VI-NEXT:    s_or_b32 s0, s0, s8
-; VI-NEXT:    s_lshl_b32 s8, s16, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s19, s6, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s7, s7, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s28, 8
-; VI-NEXT:    s_lshr_b32 s18, s6, 24
-; VI-NEXT:    s_add_i32 s19, s19, s19
-; VI-NEXT:    s_or_b32 s7, s7, s9
-; VI-NEXT:    s_bfe_u32 s29, s6, 0x80008
-; VI-NEXT:    s_add_i32 s18, s18, s18
+; VI-NEXT:    s_or_b32 s7, s7, s15
+; VI-NEXT:    s_or_b32 s6, s6, s13
+; VI-NEXT:    s_or_b32 s5, s5, s11
+; VI-NEXT:    s_or_b32 s4, s4, s9
+; VI-NEXT:    s_or_b32 s22, s22, s23
+; VI-NEXT:    s_and_b32 s3, s3, 0xff
+; VI-NEXT:    s_lshl_b32 s23, s31, 8
+; VI-NEXT:    s_or_b32 s20, s20, s21
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
+; VI-NEXT:    s_lshl_b32 s21, s30, 8
+; VI-NEXT:    s_or_b32 s18, s18, s19
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
+; VI-NEXT:    s_lshl_b32 s19, s29, 8
+; VI-NEXT:    s_or_b32 s16, s16, s17
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_lshl_b32 s17, s28, 8
+; VI-NEXT:    s_lshl_b32 s14, s14, 16
 ; VI-NEXT:    s_and_b32 s7, s7, 0xffff
-; VI-NEXT:    s_and_b32 s9, s19, 0xff
-; VI-NEXT:    s_add_i32 s29, s29, s29
-; VI-NEXT:    s_add_i32 s6, s6, s6
-; VI-NEXT:    s_or_b32 s7, s7, s8
-; VI-NEXT:    s_lshl_b32 s8, s18, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s21, s5, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s6, s6, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s29, 8
-; VI-NEXT:    s_lshr_b32 s20, s5, 24
-; VI-NEXT:    s_add_i32 s21, s21, s21
-; VI-NEXT:    s_or_b32 s6, s6, s9
-; VI-NEXT:    s_bfe_u32 s30, s5, 0x80008
-; VI-NEXT:    s_add_i32 s20, s20, s20
+; VI-NEXT:    s_lshl_b32 s12, s12, 16
 ; VI-NEXT:    s_and_b32 s6, s6, 0xffff
-; VI-NEXT:    s_and_b32 s9, s21, 0xff
-; VI-NEXT:    s_add_i32 s30, s30, s30
-; VI-NEXT:    s_add_i32 s5, s5, s5
-; VI-NEXT:    s_or_b32 s6, s6, s8
-; VI-NEXT:    s_lshl_b32 s8, s20, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_lshr_b32 s23, s4, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s5, s5, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s30, 8
-; VI-NEXT:    s_lshr_b32 s22, s4, 24
-; VI-NEXT:    s_add_i32 s23, s23, s23
-; VI-NEXT:    s_or_b32 s5, s5, s9
-; VI-NEXT:    s_bfe_u32 s31, s4, 0x80008
-; VI-NEXT:    s_add_i32 s22, s22, s22
+; VI-NEXT:    s_lshl_b32 s10, s10, 16
 ; VI-NEXT:    s_and_b32 s5, s5, 0xffff
-; VI-NEXT:    s_and_b32 s9, s23, 0xff
-; VI-NEXT:    s_add_i32 s31, s31, s31
-; VI-NEXT:    s_add_i32 s4, s4, s4
-; VI-NEXT:    s_or_b32 s5, s5, s8
-; VI-NEXT:    s_lshl_b32 s8, s22, 24
-; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s4, s4, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s31, 8
-; VI-NEXT:    s_or_b32 s4, s4, s9
+; VI-NEXT:    s_lshl_b32 s8, s8, 16
 ; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_or_b32 s3, s3, s23
+; VI-NEXT:    s_or_b32 s2, s2, s21
+; VI-NEXT:    s_or_b32 s1, s1, s19
+; VI-NEXT:    s_or_b32 s0, s0, s17
+; VI-NEXT:    s_or_b32 s7, s7, s14
+; VI-NEXT:    s_or_b32 s6, s6, s12
+; VI-NEXT:    s_or_b32 s5, s5, s10
 ; VI-NEXT:    s_or_b32 s4, s4, s8
+; VI-NEXT:    s_lshl_b32 s22, s22, 16
+; VI-NEXT:    s_and_b32 s3, s3, 0xffff
+; VI-NEXT:    s_lshl_b32 s20, s20, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_lshl_b32 s18, s18, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s16, s16, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    s_or_b32 s3, s3, s22
+; VI-NEXT:    s_or_b32 s2, s2, s20
+; VI-NEXT:    s_or_b32 s1, s1, s18
+; VI-NEXT:    s_or_b32 s0, s0, s16
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index 6505e390355a8c..e492a32cf009c9 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -S -mtriple=amdgcn-- -codegenprepare < %s | FileCheck -check-prefix=OPT %s
 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -codegenprepare < %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; This particular case will actually be worse in terms of code size
@@ -120,15 +120,12 @@ ret:
 ; GCN-LABEL: {{^}}sink_ubfe_i16:
 ; GCN-NOT: lshr
 ; VI: s_load_dword [[ARG:s[0-9]+]], s[2:3], 0x2c
-; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004
 ; GCN: s_cbranch_scc{{[0-1]}}
 
-; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
-; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7f
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
 
 ; GCN: .LBB2_3:
-; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
-; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
 
 ; GCN: buffer_store_short
 ; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 93e14a205f05d4..8ef74c70df7a1f 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1650,15 +1650,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_min_u32_e32 v2, 32, v2
-; VI-NEXT:    v_add_u32_e32 v2, vcc, -16, v2
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -1696,11 +1691,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1727,13 +1718,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, -16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index ccd23a91c35733..d7e3296c914a11 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -1402,15 +1402,10 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
-; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT:    v_ffbl_b32_e32 v2, v2
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -1448,10 +1443,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
-; GFX10-NEXT:    v_cmp_ne_u32_sdwa vcc_lo, v1, v0 src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 4c7c8bc1c027d7..dc6ee7c562f48e 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1460,13 +1460,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, 0xff
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, 0x100, v0
-; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT:    v_ffbl_b32_e32 v2, v2
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_byte v[0:1], v2
@@ -1558,19 +1553,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_readfirstlane_b32 s2, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_readfirstlane_b32 s3, v0
-; VI-NEXT:    s_lshl_b32 s2, s2, 8
-; VI-NEXT:    s_or_b32 s2, s2, s3
-; VI-NEXT:    s_or_b32 s3, s2, 0x10000
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_ff1_i32_b32 s3, s3
-; VI-NEXT:    s_cmp_lg_u32 s2, 0
-; VI-NEXT:    s_cselect_b32 s2, s3, 0xffff
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_ffbl_b32_e32 v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index e447429539e6ff..9c3f5f1cd672d8 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -635,12 +635,7 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
 ; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s4, 16
-; VI-NEXT:    s_xor_b32 s3, s4, 0x8000
-; VI-NEXT:    s_xor_b32 s2, s2, 0x8000
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_xor_b32 s2, s4, 0x80008000
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
@@ -721,11 +716,9 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg)
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x4000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s2, s4, 16
-; VI-NEXT:    s_xor_b32 s2, s2, 0x8000
-; VI-NEXT:    s_xor_b32 s3, s4, 0x8000
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_add_f16_e64 v1, s3, 2.0
-; VI-NEXT:    v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_sub_f16_e64 v1, 2.0, s4
+; VI-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v1, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index a4bde5c9d82153..c06a3dab329822 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -20,13 +20,13 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s4, s2, 0xffff
-; VI-NEXT:    s_lshr_b32 s2, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    s_lshr_b32 s2, s2, s5
-; VI-NEXT:    s_lshr_b32 s3, s4, s3
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_lshr_b32 s4, s3, 16
+; VI-NEXT:    s_lshr_b32 s5, s2, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_lshr_b32 s4, s5, s4
+; VI-NEXT:    s_lshr_b32 s2, s2, s3
+; VI-NEXT:    s_lshl_b32 s3, s4, 16
+; VI-NEXT:    s_or_b32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 05ef2698c1f774..1371e82b18b03e 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -434,13 +434,13 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ;
 ; VI-LABEL: s_test_imin_sle_i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
-; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
+; VI-NEXT:    s_load_dword s2, s[6:7], 0x4c
+; VI-NEXT:    s_load_dword s3, s[6:7], 0x28
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i8 s2, s2
 ; VI-NEXT:    s_sext_i32_i8 s3, s3
-; VI-NEXT:    s_min_i32 s2, s2, s3
+; VI-NEXT:    s_min_i32 s2, s3, s2
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
@@ -449,14 +449,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ;
 ; GFX9-LABEL: s_test_imin_sle_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x28
-; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x4c
+; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x4c
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x28
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_sext_i32_i8 s2, s2
 ; GFX9-NEXT:    s_sext_i32_i8 s3, s3
-; GFX9-NEXT:    s_min_i32 s2, s2, s3
+; GFX9-NEXT:    s_min_i32 s2, s3, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -464,14 +464,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; GFX10-LABEL: s_test_imin_sle_i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dword s2, s[6:7], 0x28
-; GFX10-NEXT:    s_load_dword s3, s[6:7], 0x4c
+; GFX10-NEXT:    s_load_dword s2, s[6:7], 0x4c
+; GFX10-NEXT:    s_load_dword s3, s[6:7], 0x28
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_sext_i32_i8 s2, s2
 ; GFX10-NEXT:    s_sext_i32_i8 s3, s3
-; GFX10-NEXT:    s_min_i32 s2, s2, s3
+; GFX10-NEXT:    s_min_i32 s2, s3, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
@@ -479,15 +479,15 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; GFX11-LABEL: s_test_imin_sle_i8:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x28
-; GFX11-NEXT:    s_load_b32 s5, s[2:3], 0x4c
+; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x4c
+; GFX11-NEXT:    s_load_b32 s5, s[2:3], 0x28
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_sext_i32_i8 s2, s4
 ; GFX11-NEXT:    s_sext_i32_i8 s3, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_i32 s2, s2, s3
+; GFX11-NEXT:    s_min_i32 s2, s3, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
@@ -590,28 +590,30 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; VI-LABEL: s_test_imin_sle_v4i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_ashr_i32 s4, s2, 24
-; VI-NEXT:    s_bfe_i32 s5, s2, 0x80010
-; VI-NEXT:    s_bfe_i32 s6, s2, 0x80008
-; VI-NEXT:    s_sext_i32_i8 s2, s2
-; VI-NEXT:    s_ashr_i32 s7, s3, 24
-; VI-NEXT:    s_bfe_i32 s8, s3, 0x80010
-; VI-NEXT:    s_bfe_i32 s9, s3, 0x80008
+; VI-NEXT:    s_ashr_i32 s6, s3, 24
+; VI-NEXT:    s_min_i32 s4, s4, s6
+; VI-NEXT:    s_bfe_i32 s6, s3, 0x80010
+; VI-NEXT:    s_bfe_i32 s8, s2, 0x80010
+; VI-NEXT:    s_min_i32 s6, s8, s6
+; VI-NEXT:    s_sext_i32_i16 s5, s2
+; VI-NEXT:    s_sext_i32_i16 s7, s3
+; VI-NEXT:    s_lshl_b32 s4, s4, 8
+; VI-NEXT:    s_and_b32 s6, s6, 0xff
+; VI-NEXT:    s_or_b32 s4, s6, s4
+; VI-NEXT:    s_ashr_i32 s6, s7, 8
+; VI-NEXT:    s_ashr_i32 s5, s5, 8
 ; VI-NEXT:    s_sext_i32_i8 s3, s3
+; VI-NEXT:    s_sext_i32_i8 s2, s2
+; VI-NEXT:    s_min_i32 s5, s5, s6
 ; VI-NEXT:    s_min_i32 s2, s2, s3
-; VI-NEXT:    s_min_i32 s3, s6, s9
-; VI-NEXT:    s_min_i32 s5, s5, s8
-; VI-NEXT:    s_min_i32 s4, s4, s7
-; VI-NEXT:    s_and_b32 s5, s5, 0xff
-; VI-NEXT:    s_lshl_b32 s3, s3, 8
+; VI-NEXT:    s_lshl_b32 s5, s5, 8
 ; VI-NEXT:    s_and_b32 s2, s2, 0xff
-; VI-NEXT:    s_lshl_b32 s4, s4, 24
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
-; VI-NEXT:    s_or_b32 s2, s2, s3
-; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_or_b32 s2, s2, s5
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; VI-NEXT:    s_or_b32 s2, s2, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -622,36 +624,35 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ;
 ; GFX9-LABEL: s_test_imin_sle_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x28
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s3, 16
-; GFX9-NEXT:    s_ashr_i32 s9, s3, 24
-; GFX9-NEXT:    s_ashr_i32 s6, s2, 24
-; GFX9-NEXT:    s_bfe_i32 s8, s8, 0x80000
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_bfe_i32 s5, s5, 0x80000
+; GFX9-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX9-NEXT:    s_sext_i32_i16 s7, s3
-; GFX9-NEXT:    v_min_i16_e32 v1, s6, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    s_sext_i32_i16 s4, s2
-; GFX9-NEXT:    s_lshr_b32 s7, s7, 8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT:    v_min_i16_e32 v2, s5, v2
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 8
-; GFX9-NEXT:    s_bfe_i32 s3, s3, 0x80000
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x80000
-; GFX9-NEXT:    v_min_i16_e32 v2, s4, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX9-NEXT:    v_min_i16_e32 v3, s2, v3
-; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_ashr_i32 s7, s7, 8
+; GFX9-NEXT:    s_ashr_i32 s5, s5, 8
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX9-NEXT:    s_ashr_i32 s6, s3, 24
+; GFX9-NEXT:    s_min_i32 s5, s5, s7
+; GFX9-NEXT:    s_sext_i32_i8 s7, s3
+; GFX9-NEXT:    s_sext_i32_i8 s8, s2
+; GFX9-NEXT:    s_bfe_i32 s3, s3, 0x80010
+; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x80010
+; GFX9-NEXT:    s_min_i32 s7, s8, s7
+; GFX9-NEXT:    s_min_i32 s4, s4, s6
+; GFX9-NEXT:    s_min_i32 s2, s2, s3
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX9-NEXT:    s_or_b32 s5, s7, s5
+; GFX9-NEXT:    s_or_b32 s2, s2, s4
+; GFX9-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s2, s5, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -661,70 +662,69 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; GFX10-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; GFX10-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i16 s4, s2
+; GFX10-NEXT:    s_sext_i32_i16 s5, s2
 ; GFX10-NEXT:    s_sext_i32_i16 s7, s3
-; GFX10-NEXT:    s_ashr_i32 s6, s2, 24
-; GFX10-NEXT:    s_ashr_i32 s9, s3, 24
-; GFX10-NEXT:    s_lshr_b32 s4, s4, 8
-; GFX10-NEXT:    s_lshr_b32 s7, s7, 8
-; GFX10-NEXT:    v_min_i16 v0, s6, s9
-; GFX10-NEXT:    v_min_i16 v1, s4, s7
-; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s8, s3, 16
-; GFX10-NEXT:    s_bfe_i32 s2, s2, 0x80000
-; GFX10-NEXT:    s_bfe_i32 s5, s5, 0x80000
-; GFX10-NEXT:    s_bfe_i32 s4, s8, 0x80000
-; GFX10-NEXT:    s_bfe_i32 s3, s3, 0x80000
-; GFX10-NEXT:    v_min_i16 v2, s5, s4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX10-NEXT:    v_min_i16 v3, s2, s3
-; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX10-NEXT:    s_ashr_i32 s6, s3, 24
+; GFX10-NEXT:    s_sext_i32_i8 s8, s3
+; GFX10-NEXT:    s_sext_i32_i8 s9, s2
+; GFX10-NEXT:    s_bfe_i32 s3, s3, 0x80010
+; GFX10-NEXT:    s_bfe_i32 s2, s2, 0x80010
+; GFX10-NEXT:    s_ashr_i32 s7, s7, 8
+; GFX10-NEXT:    s_ashr_i32 s5, s5, 8
+; GFX10-NEXT:    s_min_i32 s8, s9, s8
+; GFX10-NEXT:    s_min_i32 s4, s4, s6
+; GFX10-NEXT:    s_min_i32 s2, s2, s3
+; GFX10-NEXT:    s_min_i32 s3, s5, s7
+; GFX10-NEXT:    s_and_b32 s5, s8, 0xff
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX10-NEXT:    s_or_b32 s3, s5, s3
+; GFX10-NEXT:    s_or_b32 s2, s2, s4
+; GFX10-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX10-NEXT:    s_or_b32 s2, s3, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_imin_sle_v4i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x28
-; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x4c
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sext_i32_i16 s4, s0
-; GFX11-NEXT:    s_lshr_b32 s5, s0, 16
-; GFX11-NEXT:    s_sext_i32_i16 s7, s1
-; GFX11-NEXT:    s_lshr_b32 s8, s1, 16
-; GFX11-NEXT:    s_ashr_i32 s6, s0, 24
-; GFX11-NEXT:    s_bfe_i32 s0, s0, 0x80000
-; GFX11-NEXT:    s_ashr_i32 s9, s1, 24
-; GFX11-NEXT:    s_bfe_i32 s1, s1, 0x80000
-; GFX11-NEXT:    s_lshr_b32 s4, s4, 8
-; GFX11-NEXT:    s_bfe_i32 s5, s5, 0x80000
-; GFX11-NEXT:    s_lshr_b32 s7, s7, 8
-; GFX11-NEXT:    s_bfe_i32 s8, s8, 0x80000
-; GFX11-NEXT:    v_min_i16 v0, s6, s9
-; GFX11-NEXT:    v_min_i16 v1, s0, s1
-; GFX11-NEXT:    v_min_i16 v2, s5, s8
-; GFX11-NEXT:    v_min_i16 v3, s4, s7
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x28
+; GFX11-NEXT:    s_load_b32 s5, s[2:3], 0x4c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT:    s_sext_i32_i16 s3, s4
+; GFX11-NEXT:    s_sext_i32_i16 s7, s5
+; GFX11-NEXT:    s_ashr_i32 s2, s4, 24
+; GFX11-NEXT:    s_ashr_i32 s6, s5, 24
+; GFX11-NEXT:    s_sext_i32_i8 s8, s5
+; GFX11-NEXT:    s_sext_i32_i8 s9, s4
+; GFX11-NEXT:    s_bfe_i32 s5, s5, 0x80010
+; GFX11-NEXT:    s_bfe_i32 s4, s4, 0x80010
+; GFX11-NEXT:    s_ashr_i32 s7, s7, 8
+; GFX11-NEXT:    s_ashr_i32 s3, s3, 8
+; GFX11-NEXT:    s_min_i32 s8, s9, s8
+; GFX11-NEXT:    s_min_i32 s2, s2, s6
+; GFX11-NEXT:    s_min_i32 s4, s4, s5
+; GFX11-NEXT:    s_min_i32 s3, s3, s7
+; GFX11-NEXT:    s_and_b32 s5, s8, 0xff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX11-NEXT:    s_or_b32 s3, s5, s3
+; GFX11-NEXT:    s_or_b32 s2, s4, s2
+; GFX11-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -789,11 +789,11 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s4, s2, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_ashr_i32 s5, s3, 16
+; VI-NEXT:    s_ashr_i32 s4, s3, 16
+; VI-NEXT:    s_ashr_i32 s5, s2, 16
 ; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_min_i32 s4, s4, s5
+; VI-NEXT:    s_sext_i32_i16 s2, s2
+; VI-NEXT:    s_min_i32 s4, s5, s4
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    s_lshl_b32 s3, s4, 16
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
@@ -953,24 +953,24 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s6, s1, 16
-; VI-NEXT:    s_sext_i32_i16 s1, s1
-; VI-NEXT:    s_ashr_i32 s8, s3, 16
+; VI-NEXT:    s_ashr_i32 s6, s3, 16
+; VI-NEXT:    s_ashr_i32 s7, s1, 16
 ; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_ashr_i32 s7, s0, 16
-; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_ashr_i32 s9, s2, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_min_i32 s6, s6, s8
+; VI-NEXT:    s_sext_i32_i16 s1, s1
+; VI-NEXT:    s_min_i32 s6, s7, s6
 ; VI-NEXT:    s_min_i32 s1, s1, s3
-; VI-NEXT:    s_min_i32 s7, s7, s9
-; VI-NEXT:    s_min_i32 s0, s0, s2
-; VI-NEXT:    s_lshl_b32 s2, s6, 16
+; VI-NEXT:    s_lshl_b32 s6, s6, 16
 ; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_or_b32 s1, s1, s2
-; VI-NEXT:    s_lshl_b32 s2, s7, 16
+; VI-NEXT:    s_or_b32 s1, s1, s6
+; VI-NEXT:    s_ashr_i32 s3, s2, 16
+; VI-NEXT:    s_ashr_i32 s6, s0, 16
+; VI-NEXT:    s_sext_i32_i16 s2, s2
+; VI-NEXT:    s_sext_i32_i16 s0, s0
+; VI-NEXT:    s_min_i32 s3, s6, s3
+; VI-NEXT:    s_min_i32 s0, s0, s2
+; VI-NEXT:    s_lshl_b32 s3, s3, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    s_or_b32 s0, s0, s3
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2661,19 +2661,22 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    flat_load_ushort v4, v[0:1]
-; VI-NEXT:    flat_load_ushort v5, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    flat_load_ushort v4, v[2:3]
+; VI-NEXT:    flat_load_ushort v5, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_and_b32_e32 v6, 0xffff, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cmp_lt_u32_e32 vcc, v4, v5
-; VI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; VI-NEXT:    v_and_b32_e32 v7, 0xffff, v5
+; VI-NEXT:    v_cmp_lt_u32_e32 vcc, v7, v6
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; VI-NEXT:    flat_store_short v[0:1], v4
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    flat_store_byte v[2:3], v0
@@ -2687,7 +2690,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; GFX9-NEXT:    global_load_ushort v1, v0, s[12:13]
 ; GFX9-NEXT:    global_load_ushort v2, v0, s[14:15]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    global_store_short v0, v1, s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -2703,7 +2706,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; GFX10-NEXT:    global_load_ushort v1, v0, s[12:13]
 ; GFX10-NEXT:    global_load_ushort v2, v0, s[14:15]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cmp_lt_u32_sdwa vcc_lo, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX10-NEXT:    global_store_short v0, v1, s[8:9]
@@ -2716,11 +2719,15 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[4:5]
-; GFX11-NEXT:    global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
@@ -3174,38 +3181,38 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s11, 16
-; VI-NEXT:    s_lshr_b32 s4, s10, 16
-; VI-NEXT:    s_and_b32 s5, s10, 0xffff
-; VI-NEXT:    s_lshr_b32 s10, s15, 16
+; VI-NEXT:    s_and_b32 s2, s15, 0xffff
 ; VI-NEXT:    s_and_b32 s3, s11, 0xffff
-; VI-NEXT:    s_and_b32 s11, s15, 0xffff
-; VI-NEXT:    s_lshr_b32 s15, s14, 16
-; VI-NEXT:    s_min_u32 s2, s2, s10
+; VI-NEXT:    s_lshr_b32 s4, s15, 16
+; VI-NEXT:    s_lshr_b32 s5, s11, 16
+; VI-NEXT:    s_min_u32 s2, s3, s2
+; VI-NEXT:    s_min_u32 s3, s5, s4
+; VI-NEXT:    s_lshl_b32 s3, s3, 16
+; VI-NEXT:    s_or_b32 s2, s2, s3
+; VI-NEXT:    s_and_b32 s3, s14, 0xffff
+; VI-NEXT:    s_and_b32 s4, s10, 0xffff
+; VI-NEXT:    s_min_u32 s3, s4, s3
+; VI-NEXT:    s_lshr_b32 s4, s14, 16
+; VI-NEXT:    s_lshr_b32 s5, s10, 16
+; VI-NEXT:    s_min_u32 s4, s5, s4
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_or_b32 s3, s3, s4
+; VI-NEXT:    s_and_b32 s4, s13, 0xffff
+; VI-NEXT:    s_and_b32 s5, s9, 0xffff
+; VI-NEXT:    s_min_u32 s4, s5, s4
+; VI-NEXT:    s_lshr_b32 s5, s13, 16
 ; VI-NEXT:    s_lshr_b32 s6, s9, 16
-; VI-NEXT:    s_and_b32 s7, s9, 0xffff
-; VI-NEXT:    s_lshr_b32 s9, s8, 16
-; VI-NEXT:    s_and_b32 s14, s14, 0xffff
-; VI-NEXT:    s_lshr_b32 s16, s13, 16
-; VI-NEXT:    s_lshr_b32 s17, s12, 16
-; VI-NEXT:    s_min_u32 s4, s4, s15
-; VI-NEXT:    s_min_u32 s3, s3, s11
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s8, s8, 0xffff
-; VI-NEXT:    s_and_b32 s13, s13, 0xffff
-; VI-NEXT:    s_and_b32 s12, s12, 0xffff
-; VI-NEXT:    s_min_u32 s9, s9, s17
-; VI-NEXT:    s_min_u32 s6, s6, s16
-; VI-NEXT:    s_min_u32 s5, s5, s14
-; VI-NEXT:    s_or_b32 s2, s3, s2
-; VI-NEXT:    s_lshl_b32 s3, s4, 16
-; VI-NEXT:    s_min_u32 s8, s8, s12
-; VI-NEXT:    s_min_u32 s7, s7, s13
-; VI-NEXT:    s_or_b32 s3, s5, s3
-; VI-NEXT:    s_lshl_b32 s4, s6, 16
-; VI-NEXT:    s_lshl_b32 s5, s9, 16
-; VI-NEXT:    s_or_b32 s4, s7, s4
-; VI-NEXT:    s_or_b32 s5, s8, s5
+; VI-NEXT:    s_min_u32 s5, s6, s5
+; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_and_b32 s5, s12, 0xffff
+; VI-NEXT:    s_and_b32 s6, s8, 0xffff
+; VI-NEXT:    s_min_u32 s5, s6, s5
+; VI-NEXT:    s_lshr_b32 s6, s12, 16
+; VI-NEXT:    s_lshr_b32 s7, s8, 16
+; VI-NEXT:    s_min_u32 s6, s7, s6
+; VI-NEXT:    s_lshl_b32 s6, s6, 16
+; VI-NEXT:    s_or_b32 s5, s5, s6
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v0, s5
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3536,9 +3543,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_sext_i32_i16 s3, s2
-; VI-NEXT:    s_ashr_i32 s2, s2, 16
-; VI-NEXT:    s_min_i32 s2, s3, s2
+; VI-NEXT:    s_ashr_i32 s3, s2, 16
+; VI-NEXT:    s_sext_i32_i16 s2, s2
+; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
@@ -3551,9 +3558,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s3, s2
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    s_min_i32 s2, s3, s2
+; GFX9-NEXT:    s_ashr_i32 s3, s2, 16
+; GFX9-NEXT:    s_sext_i32_i16 s2, s2
+; GFX9-NEXT:    s_min_i32 s2, s2, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -3565,9 +3572,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i16 s3, s2
-; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_min_i32 s2, s3, s2
+; GFX10-NEXT:    s_ashr_i32 s3, s2, 16
+; GFX10-NEXT:    s_sext_i32_i16 s2, s2
+; GFX10-NEXT:    s_min_i32 s2, s2, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
@@ -3579,10 +3586,10 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sext_i32_i16 s2, s4
-; GFX11-NEXT:    s_ashr_i32 s3, s4, 16
+; GFX11-NEXT:    s_ashr_i32 s2, s4, 16
+; GFX11-NEXT:    s_sext_i32_i16 s3, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_i32 s2, s2, s3
+; GFX11-NEXT:    s_min_i32 s2, s3, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 5a1cc72644d47d..b1066e0f8f26ad 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1787,15 +1787,14 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; NOSDWA-NEXT:    flat_load_dword v1, v[0:1]
 ; NOSDWA-NEXT:    flat_load_dword v2, v[2:3]
 ; NOSDWA-NEXT:    v_mov_b32_e32 v0, s4
-; NOSDWA-NEXT:    s_waitcnt vmcnt(1)
-; NOSDWA-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; NOSDWA-NEXT:    s_waitcnt vmcnt(0)
-; NOSDWA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; NOSDWA-NEXT:    v_add_u32_e32 v3, vcc, v1, v2
+; NOSDWA-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; NOSDWA-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; NOSDWA-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; NOSDWA-NEXT:    v_add_u32_e32 v2, vcc, v3, v4
-; NOSDWA-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; NOSDWA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; NOSDWA-NEXT:    v_or_b32_e32 v2, v1, v2
+; NOSDWA-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; NOSDWA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; NOSDWA-NEXT:    v_or_b32_e32 v2, v3, v1
 ; NOSDWA-NEXT:    v_mov_b32_e32 v1, s5
 ; NOSDWA-NEXT:    flat_store_dword v[0:1], v2
 ; NOSDWA-NEXT:    s_endpgm
@@ -1813,9 +1812,9 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX89-NEXT:    flat_load_dword v2, v[2:3]
 ; GFX89-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_add_u32_sdwa v3, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX89-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; GFX89-NEXT:    v_or_b32_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT:    v_add_u32_e32 v3, vcc, v1, v2
+; GFX89-NEXT:    v_add_u32_sdwa v1, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX89-NEXT:    v_or_b32_sdwa v2, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX89-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX89-NEXT:    flat_store_dword v[0:1], v2
 ; GFX89-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
index 4e3dccb975fe8e..fdae5d411d3621 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -521,13 +521,10 @@ define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(ptr addrspace(1) %o
 ; FUNC-LABEL: {{^}}s_sext_in_reg_i1_i16:
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 
-; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000
-; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
-; SI: buffer_store_short [[VBFE]]
+; GCN: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000
+; GCN: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; GCN: buffer_store_short [[VBFE]]
 
-; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
-; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
 define amdgpu_kernel void @s_sext_in_reg_i1_i16(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
   %ld = load i32, ptr addrspace(4) %ptr
   %in = trunc i32 %ld to i16
@@ -622,9 +619,9 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(ptr addrspace(1) %out, i16 %
 ; SI: v_mov_b32_e32 [[VSEXT:v[0-9]+]], [[SSEXT]]
 ; SI: buffer_store_short [[VSEXT]]
 
-; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
-; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+; GFX89: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x80000
+; GFX89: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; GFX89: buffer_store_short [[VBFE]]
 define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(ptr addrspace(1) %out, i16 %in) #0 {
   %shl = shl i16 %in, 8
   %sext = ashr i16 %shl, 8
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 9b9f03ff74aa3f..44dd0b6e27e740 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -27,9 +27,9 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshr_b32 s0, s2, 16
-; VI-NEXT:    s_lshr_b32 s1, s3, 16
-; VI-NEXT:    s_lshl_b32 s0, s0, s1
+; VI-NEXT:    s_lshr_b32 s0, s3, 16
+; VI-NEXT:    s_lshr_b32 s1, s2, 16
+; VI-NEXT:    s_lshl_b32 s0, s1, s0
 ; VI-NEXT:    s_lshl_b32 s1, s2, s3
 ; VI-NEXT:    s_lshl_b32 s0, s0, 16
 ; VI-NEXT:    s_and_b32 s1, s1, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 3446e0384cc545..24efb1252dcffe 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -22,19 +22,19 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0
 ; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s4, 16
-; VI-NEXT:    s_sub_i32 s3, 0, s4
+; VI-NEXT:    s_sub_i32 s2, 0, s4
+; VI-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-NEXT:    s_ashr_i32 s5, s4, 16
+; VI-NEXT:    s_sub_i32 s3, 0, s3
+; VI-NEXT:    s_sext_i32_i16 s2, s2
 ; VI-NEXT:    s_sext_i32_i16 s4, s4
-; VI-NEXT:    s_sub_i32 s2, 0, s2
 ; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_max_i32 s3, s4, s3
-; VI-NEXT:    s_max_i32 s2, s5, s2
-; VI-NEXT:    s_add_i32 s3, s3, 2
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_or_b32 s2, s2, s3
+; VI-NEXT:    s_max_i32 s2, s4, s2
+; VI-NEXT:    s_max_i32 s3, s5, s3
+; VI-NEXT:    s_add_i32 s2, s2, 2
+; VI-NEXT:    s_lshl_b32 s3, s3, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_or_b32 s2, s3, s2
 ; VI-NEXT:    s_add_i32 s2, s2, 0x20000
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -171,19 +171,19 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val)
 ; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s4, 16
-; VI-NEXT:    s_sub_i32 s3, 0, s4
+; VI-NEXT:    s_sub_i32 s2, 0, s4
+; VI-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-NEXT:    s_ashr_i32 s5, s4, 16
+; VI-NEXT:    s_sub_i32 s3, 0, s3
+; VI-NEXT:    s_sext_i32_i16 s2, s2
 ; VI-NEXT:    s_sext_i32_i16 s4, s4
-; VI-NEXT:    s_sub_i32 s2, 0, s2
 ; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_max_i32 s3, s4, s3
-; VI-NEXT:    s_max_i32 s2, s5, s2
-; VI-NEXT:    s_add_i32 s3, s3, 2
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_or_b32 s2, s2, s3
+; VI-NEXT:    s_max_i32 s2, s4, s2
+; VI-NEXT:    s_max_i32 s3, s5, s3
+; VI-NEXT:    s_add_i32 s2, s2, 2
+; VI-NEXT:    s_lshl_b32 s3, s3, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_or_b32 s2, s3, s2
 ; VI-NEXT:    s_add_i32 s2, s2, 0x20000
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -331,31 +331,31 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    s_sub_i32 s6, 0, s3
-; VI-NEXT:    s_sub_i32 s7, 0, s2
-; VI-NEXT:    s_sub_i32 s5, 0, s5
-; VI-NEXT:    s_sub_i32 s4, 0, s4
+; VI-NEXT:    s_lshr_b32 s7, s2, 16
+; VI-NEXT:    s_sub_i32 s7, 0, s7
+; VI-NEXT:    s_sub_i32 s4, 0, s3
+; VI-NEXT:    s_lshr_b32 s6, s3, 16
 ; VI-NEXT:    s_ashr_i32 s8, s2, 16
-; VI-NEXT:    s_ashr_i32 s9, s3, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_sext_i32_i16 s3, s3
 ; VI-NEXT:    s_sext_i32_i16 s7, s7
-; VI-NEXT:    s_sext_i32_i16 s6, s6
+; VI-NEXT:    s_sub_i32 s5, 0, s2
+; VI-NEXT:    s_sub_i32 s6, 0, s6
+; VI-NEXT:    s_max_i32 s7, s8, s7
+; VI-NEXT:    s_ashr_i32 s8, s3, 16
 ; VI-NEXT:    s_sext_i32_i16 s4, s4
+; VI-NEXT:    s_sext_i32_i16 s3, s3
+; VI-NEXT:    s_sext_i32_i16 s6, s6
 ; VI-NEXT:    s_sext_i32_i16 s5, s5
-; VI-NEXT:    s_max_i32 s3, s3, s6
-; VI-NEXT:    s_max_i32 s2, s2, s7
-; VI-NEXT:    s_max_i32 s5, s9, s5
-; VI-NEXT:    s_max_i32 s4, s8, s4
-; VI-NEXT:    s_add_i32 s2, s2, 2
+; VI-NEXT:    s_sext_i32_i16 s2, s2
+; VI-NEXT:    s_max_i32 s3, s3, s4
+; VI-NEXT:    s_max_i32 s6, s8, s6
+; VI-NEXT:    s_max_i32 s2, s2, s5
 ; VI-NEXT:    s_add_i32 s3, s3, 2
-; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_lshl_b32 s4, s6, 16
 ; VI-NEXT:    s_and_b32 s3, s3, 0xffff
+; VI-NEXT:    s_add_i32 s2, s2, 2
+; VI-NEXT:    s_or_b32 s3, s4, s3
+; VI-NEXT:    s_lshl_b32 s4, s7, 16
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_or_b32 s3, s5, s3
 ; VI-NEXT:    s_or_b32 s2, s4, s2
 ; VI-NEXT:    s_add_i32 s3, s3, 0x20000
 ; VI-NEXT:    s_add_i32 s2, s2, 0x20000
@@ -559,21 +559,21 @@ define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    s_ashr_i32 s2, s0, 16
-; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_ashr_i32 s3, s1, 16
+; VI-NEXT:    s_ashr_i32 s2, s1, 16
+; VI-NEXT:    s_ashr_i32 s3, s0, 16
 ; VI-NEXT:    s_sext_i32_i16 s1, s1
+; VI-NEXT:    s_sext_i32_i16 s0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_max_i32 s4, s2, s3
+; VI-NEXT:    s_max_i32 s4, s3, s2
 ; VI-NEXT:    s_max_i32 s5, s0, s1
 ; VI-NEXT:    s_lshl_b32 s4, s4, 16
 ; VI-NEXT:    s_and_b32 s5, s5, 0xffff
-; VI-NEXT:    s_min_i32 s2, s2, s3
+; VI-NEXT:    s_min_i32 s2, s3, s2
 ; VI-NEXT:    s_min_i32 s0, s0, s1
 ; VI-NEXT:    s_or_b32 s4, s5, s4
-; VI-NEXT:    s_lshl_b32 s1, s2, 16
+; VI-NEXT:    s_lshl_b32 s2, s2, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_or_b32 s0, s0, s2
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
@@ -661,12 +661,12 @@ define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_max_i32_sdwa v6, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; VI-NEXT:    v_max_i32_sdwa v7, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_min_i32_sdwa v8, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; VI-NEXT:    v_min_i32_sdwa v4, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_i32_sdwa v6, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_i32_sdwa v7, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-NEXT:    v_min_i32_sdwa v8, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_min_i32_sdwa v4, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v5, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dword v[0:1], v5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dword v[2:3], v4
@@ -748,37 +748,37 @@ define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    s_ashr_i32 s0, s5, 16
-; VI-NEXT:    s_ashr_i32 s1, s4, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s5
-; VI-NEXT:    s_sext_i32_i16 s3, s4
-; VI-NEXT:    s_ashr_i32 s4, s7, 16
-; VI-NEXT:    s_ashr_i32 s5, s6, 16
-; VI-NEXT:    s_sext_i32_i16 s7, s7
+; VI-NEXT:    s_ashr_i32 s0, s7, 16
+; VI-NEXT:    s_ashr_i32 s1, s5, 16
+; VI-NEXT:    s_sext_i32_i16 s3, s7
+; VI-NEXT:    s_sext_i32_i16 s5, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_max_i32 s2, s1, s0
+; VI-NEXT:    s_max_i32 s7, s5, s3
+; VI-NEXT:    s_lshl_b32 s2, s2, 16
+; VI-NEXT:    s_and_b32 s7, s7, 0xffff
+; VI-NEXT:    s_or_b32 s2, s7, s2
+; VI-NEXT:    s_ashr_i32 s7, s6, 16
+; VI-NEXT:    s_ashr_i32 s8, s4, 16
 ; VI-NEXT:    s_sext_i32_i16 s6, s6
-; VI-NEXT:    s_max_i32 s8, s1, s5
-; VI-NEXT:    s_max_i32 s9, s0, s4
-; VI-NEXT:    s_max_i32 s10, s3, s6
-; VI-NEXT:    s_max_i32 s11, s2, s7
-; VI-NEXT:    s_min_i32 s0, s0, s4
-; VI-NEXT:    s_min_i32 s2, s2, s7
+; VI-NEXT:    s_sext_i32_i16 s4, s4
+; VI-NEXT:    s_min_i32 s0, s1, s0
+; VI-NEXT:    s_min_i32 s1, s5, s3
+; VI-NEXT:    s_max_i32 s9, s8, s7
+; VI-NEXT:    s_max_i32 s10, s4, s6
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
 ; VI-NEXT:    s_lshl_b32 s9, s9, 16
-; VI-NEXT:    s_and_b32 s11, s11, 0xffff
-; VI-NEXT:    s_lshl_b32 s8, s8, 16
 ; VI-NEXT:    s_and_b32 s10, s10, 0xffff
-; VI-NEXT:    s_min_i32 s1, s1, s5
-; VI-NEXT:    s_min_i32 s3, s3, s6
-; VI-NEXT:    s_lshl_b32 s0, s0, 16
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_or_b32 s9, s11, s9
-; VI-NEXT:    s_or_b32 s8, s10, s8
-; VI-NEXT:    s_or_b32 s0, s2, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s2
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_min_i32 s1, s8, s7
+; VI-NEXT:    s_min_i32 s2, s4, s6
+; VI-NEXT:    s_or_b32 s9, s10, s9
 ; VI-NEXT:    s_lshl_b32 s1, s1, 16
-; VI-NEXT:    s_and_b32 s2, s3, 0xffff
-; VI-NEXT:    v_mov_b32_e32 v4, s8
-; VI-NEXT:    v_mov_b32_e32 v5, s9
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    v_mov_b32_e32 v4, s9
 ; VI-NEXT:    s_or_b32 s1, s2, s1
 ; VI-NEXT:    v_mov_b32_e32 v6, s1
 ; VI-NEXT:    v_mov_b32_e32 v7, s0
@@ -899,42 +899,34 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_readfirstlane_b32 s0, v4
-; VI-NEXT:    v_readfirstlane_b32 s1, v5
-; VI-NEXT:    s_ashr_i32 s3, s0, 16
-; VI-NEXT:    s_ashr_i32 s5, s1, 16
-; VI-NEXT:    s_cmp_gt_i32 s3, s5
-; VI-NEXT:    s_sext_i32_i16 s2, s0
-; VI-NEXT:    s_sext_i32_i16 s4, s1
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT:    s_cselect_b32 s0, s3, s5
-; VI-NEXT:    s_cselect_b32 s3, s5, s3
-; VI-NEXT:    s_lshl_b32 s5, s0, 16
-; VI-NEXT:    s_cmp_gt_i32 s2, s4
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT:    s_cselect_b32 s0, s2, s4
-; VI-NEXT:    s_cselect_b32 s1, s4, s2
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
-; VI-NEXT:    s_lshl_b32 s2, s3, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_or_b32 s0, s0, s5
-; VI-NEXT:    v_or_b32_e32 v4, v5, v4
-; VI-NEXT:    s_or_b32 s1, s1, s2
-; VI-NEXT:    v_mov_b32_e32 v5, s0
-; VI-NEXT:    v_and_b32_e32 v4, 3, v4
-; VI-NEXT:    v_mov_b32_e32 v6, s1
-; VI-NEXT:    flat_store_dword v[0:1], v5
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_ashrrev_i32_e32 v10, 16, v4
+; VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v5
+; VI-NEXT:    v_bfe_i32 v6, v4, 0, 16
+; VI-NEXT:    v_bfe_i32 v7, v5, 0, 16
+; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, v10, v11
+; VI-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; VI-NEXT:    v_cmp_gt_i32_e64 s[0:1], v6, v7
+; VI-NEXT:    v_cndmask_b32_e64 v6, v5, v4, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v7, v8, v9, vcc
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
+; VI-NEXT:    v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; VI-NEXT:    flat_store_dword v[0:1], v6
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    flat_store_dword v[2:3], v6
+; VI-NEXT:    v_or_b32_e32 v0, v9, v5
+; VI-NEXT:    v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e32 v0, 3, v0
+; VI-NEXT:    flat_store_dword v[2:3], v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    flat_store_byte v[0:1], v4
+; VI-NEXT:    flat_store_byte v[0:1], v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
 ;
@@ -1021,19 +1013,19 @@ define amdgpu_kernel void @u_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_lshr_b32 s2, s0, 16
-; VI-NEXT:    s_lshr_b32 s3, s1, 16
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_max_u32 s5, s2, s3
+; VI-NEXT:    s_and_b32 s2, s1, 0xffff
+; VI-NEXT:    s_and_b32 s3, s0, 0xffff
+; VI-NEXT:    s_lshr_b32 s1, s1, 16
+; VI-NEXT:    s_lshr_b32 s0, s0, 16
+; VI-NEXT:    s_max_u32 s5, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    s_max_u32 s4, s0, s1
+; VI-NEXT:    s_max_u32 s4, s3, s2
 ; VI-NEXT:    s_lshl_b32 s5, s5, 16
 ; VI-NEXT:    s_min_u32 s0, s0, s1
-; VI-NEXT:    s_min_u32 s1, s2, s3
 ; VI-NEXT:    s_or_b32 s4, s4, s5
-; VI-NEXT:    s_lshl_b32 s1, s1, 16
-; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_min_u32 s2, s3, s2
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
+; VI-NEXT:    s_or_b32 s0, s2, s0
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index cd06a060a50cd8..1540c3e5c403fd 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -187,15 +187,14 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_readfirstlane_b32 s0, v1
-; VI-NEXT:    v_readfirstlane_b32 s1, v0
-; VI-NEXT:    s_ashr_i32 s2, s1, 16
-; VI-NEXT:    s_sext_i32_i16 s1, s1
+; VI-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-NEXT:    v_readfirstlane_b32 s1, v1
+; VI-NEXT:    s_lshr_b32 s2, s1, 16
 ; VI-NEXT:    s_ashr_i32 s3, s0, 16
 ; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_ashr_i32 s0, s1, s0
-; VI-NEXT:    s_ashr_i32 s1, s2, s3
-; VI-NEXT:    s_lshl_b32 s1, s1, 16
+; VI-NEXT:    s_ashr_i32 s2, s3, s2
+; VI-NEXT:    s_ashr_i32 s0, s0, s1
+; VI-NEXT:    s_lshl_b32 s1, s2, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; VI-NEXT:    s_or_b32 s0, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -282,43 +281,41 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ;
 ; VI-LABEL: ashr_v4i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    s_mov_b32 s8, s6
+; VI-NEXT:    s_mov_b32 s9, s7
 ; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_readfirstlane_b32 s0, v2
-; VI-NEXT:    v_readfirstlane_b32 s1, v3
-; VI-NEXT:    v_readfirstlane_b32 s2, v0
-; VI-NEXT:    v_readfirstlane_b32 s3, v1
-; VI-NEXT:    s_ashr_i32 s8, s3, 16
-; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_ashr_i32 s9, s2, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_ashr_i32 s10, s1, 16
-; VI-NEXT:    s_sext_i32_i16 s1, s1
-; VI-NEXT:    s_ashr_i32 s11, s0, 16
-; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_ashr_i32 s0, s2, s0
-; VI-NEXT:    s_ashr_i32 s2, s9, s11
-; VI-NEXT:    s_ashr_i32 s1, s3, s1
-; VI-NEXT:    s_ashr_i32 s3, s8, s10
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_or_b32 s1, s1, s3
-; VI-NEXT:    s_or_b32 s0, s0, s2
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    v_readfirstlane_b32 s4, v0
+; VI-NEXT:    v_readfirstlane_b32 s5, v2
+; VI-NEXT:    v_readfirstlane_b32 s6, v1
+; VI-NEXT:    v_readfirstlane_b32 s7, v3
+; VI-NEXT:    s_lshr_b32 s8, s7, 16
+; VI-NEXT:    s_ashr_i32 s9, s6, 16
+; VI-NEXT:    s_sext_i32_i16 s6, s6
+; VI-NEXT:    s_lshr_b32 s10, s5, 16
+; VI-NEXT:    s_ashr_i32 s11, s4, 16
+; VI-NEXT:    s_sext_i32_i16 s4, s4
+; VI-NEXT:    s_ashr_i32 s8, s9, s8
+; VI-NEXT:    s_ashr_i32 s6, s6, s7
+; VI-NEXT:    s_ashr_i32 s7, s11, s10
+; VI-NEXT:    s_ashr_i32 s4, s4, s5
+; VI-NEXT:    s_lshl_b32 s5, s8, 16
+; VI-NEXT:    s_and_b32 s6, s6, 0xffff
+; VI-NEXT:    s_lshl_b32 s7, s7, 16
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_or_b32 s5, s6, s5
+; VI-NEXT:    s_or_b32 s4, s4, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: ashr_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 5a821db6ff0408..327a85e80da9dc 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -117,23 +117,21 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s6, s[6:7], 0x0
-; VI-NEXT:    s_load_dword s7, s[0:1], 0x0
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dword s2, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s6, 16
-; VI-NEXT:    s_lshr_b32 s5, s7, 16
-; VI-NEXT:    s_sub_i32 s6, s6, s7
-; VI-NEXT:    s_sub_i32 s4, s4, s5
-; VI-NEXT:    s_and_b32 s5, s6, 0xffff
-; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_or_b32 s4, s5, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_sub_i32 s1, s2, s0
+; VI-NEXT:    s_lshr_b32 s0, s0, 16
+; VI-NEXT:    s_lshr_b32 s2, s2, 16
+; VI-NEXT:    s_sub_i32 s0, s2, s0
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_sub_v2i16:
@@ -235,9 +233,9 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshr_b32 s0, s2, 16
-; VI-NEXT:    s_lshr_b32 s1, s3, 16
-; VI-NEXT:    s_sub_i32 s0, s0, s1
+; VI-NEXT:    s_lshr_b32 s0, s3, 16
+; VI-NEXT:    s_lshr_b32 s1, s2, 16
+; VI-NEXT:    s_sub_i32 s0, s1, s0
 ; VI-NEXT:    s_sub_i32 s1, s2, s3
 ; VI-NEXT:    s_lshl_b32 s0, s0, 16
 ; VI-NEXT:    s_and_b32 s1, s1, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index ee99fcc5863340..6195dd1992469b 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -92,10 +92,11 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ; SDAG-VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, 0xff
 ; SDAG-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT:    v_max_i16_e64 v1, s2, 0
-; SDAG-VI-NEXT:    v_max_i16_e64 v2, s3, 0
-; SDAG-VI-NEXT:    v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; SDAG-VI-NEXT:    v_min_i16_e32 v1, 0xff, v1
+; SDAG-VI-NEXT:    s_sext_i32_i16 s2, s2
+; SDAG-VI-NEXT:    s_sext_i32_i16 s3, s3
+; SDAG-VI-NEXT:    v_med3_i32 v1, s2, 0, v0
+; SDAG-VI-NEXT:    v_med3_i32 v0, s3, 0, v0
+; SDAG-VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SDAG-VI-NEXT:    v_or_b32_e32 v2, v1, v0
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, s0
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -417,12 +418,12 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
 ; SDAG-VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, 0xff
 ; SDAG-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT:    s_lshr_b32 s2, s4, 16
-; SDAG-VI-NEXT:    v_max_i16_e64 v1, s4, 0
-; SDAG-VI-NEXT:    v_max_i16_e64 v2, s2, 0
-; SDAG-VI-NEXT:    v_min_i16_e32 v1, 0xff, v1
-; SDAG-VI-NEXT:    v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; SDAG-VI-NEXT:    v_or_b32_e32 v2, v1, v0
+; SDAG-VI-NEXT:    s_ashr_i32 s2, s4, 16
+; SDAG-VI-NEXT:    s_sext_i32_i16 s3, s4
+; SDAG-VI-NEXT:    v_med3_i32 v1, s3, 0, v0
+; SDAG-VI-NEXT:    v_med3_i32 v0, s2, 0, v0
+; SDAG-VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, s0
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v1, s1
 ; SDAG-VI-NEXT:    flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 7a1f05f56a7517..f074f7bf67f770 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -323,7 +323,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s1, s0, 0xffffff00
+; VI-NEXT:    s_and_b32 s1, s0, 0xff00
 ; VI-NEXT:    s_add_i32 s0, s0, 12
 ; VI-NEXT:    s_or_b32 s0, s0, 4
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index af50e09f509a3b..ded6f5cda33b9a 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX8,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX9,GCN %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
 
 ; R600: {{^}}s_mad_zext_i32_to_i64:
@@ -53,7 +53,8 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i
 
 ; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}}
 ; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}}
-; GCN: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]]
+; GFX8: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]]
+; GFX9: s_cmp_eq_u32 [[MASK_B]], [[MASK_A]]
 ; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN: buffer_store_short [[RESULT]]



More information about the llvm-commits mailing list