[llvm] 13f38c9 - [LLVM][SelectionDAG] Align poison/undef binop folds with IR. (#149334)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 30 03:20:34 PDT 2025
Author: Paul Walker
Date: 2025-07-30T11:20:30+01:00
New Revision: 13f38c97d597f3b07fb674d9b8c2b1db2bc0724f
URL: https://github.com/llvm/llvm-project/commit/13f38c97d597f3b07fb674d9b8c2b1db2bc0724f
DIFF: https://github.com/llvm/llvm-project/commit/13f38c97d597f3b07fb674d9b8c2b1db2bc0724f.diff
LOG: [LLVM][SelectionDAG] Align poison/undef binop folds with IR. (#149334)
The "at construction" binop folds in SelectionDAG::getNode() has
different behaviour when compared to the equivalent LLVM IR. This PR
makes the behaviour consistent while also extending the coverage to
include signed/unsigned max/min operations.
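To illustrate, here is a minimal C++ sketch (not the committed unit test) of how
the node-construction folds behave after this change; it assumes a SelectionDAG
`DAG` and an integer EVT `VT` are already set up, e.g. inside a CodeGen
unit-test fixture:

  // Minimal sketch; assumes DAG and VT come from a test fixture.
  SDLoc DL;
  SDValue Op = DAG.getConstant(7, DL, VT);

  // Any covered binop with a poison operand now folds directly to poison.
  SDValue P = DAG.getNode(ISD::SMAX, DL, VT, Op, DAG.getPOISON(VT));
  assert(P.getOpcode() == ISD::POISON);

  // smax(x, undef) now folds to MAX_INT, matching the equivalent IR fold.
  SDValue M = DAG.getNode(ISD::SMAX, DL, VT, Op, DAG.getUNDEF(VT));
  assert(isa<ConstantSDNode>(M) &&
         cast<ConstantSDNode>(M)->getAPIntValue().isMaxSignedValue());
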
Added:
llvm/unittests/CodeGen/SelectionDAGNodeConstructionTest.cpp
llvm/unittests/CodeGen/SelectionDAGTestBase.h
Modified:
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/test/CodeGen/AArch64/combine-and-like.ll
llvm/test/CodeGen/AMDGPU/saddsat.ll
llvm/test/CodeGen/AMDGPU/uaddsat.ll
llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
llvm/test/CodeGen/X86/combine-add-ssat.ll
llvm/test/CodeGen/X86/combine-add-usat.ll
llvm/test/CodeGen/X86/combine-sub-ssat.ll
llvm/test/CodeGen/X86/combine-sub-usat.ll
llvm/test/CodeGen/X86/load-combine.ll
llvm/test/CodeGen/X86/pr33960.ll
llvm/unittests/CodeGen/CMakeLists.txt
llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 5c586f73aa125..02d1100c9e7cb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7843,20 +7843,43 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
}
- // Perform trivial constant folding.
- if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags))
- return SV;
+ if (N1.getOpcode() == ISD::POISON || N2.getOpcode() == ISD::POISON) {
+ switch (Opcode) {
+ case ISD::XOR:
+ case ISD::ADD:
+ case ISD::PTRADD:
+ case ISD::SUB:
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::UDIV:
+ case ISD::SDIV:
+ case ISD::UREM:
+ case ISD::SREM:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::UMIN:
+ case ISD::OR:
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::UMAX:
+ case ISD::SMAX:
+ case ISD::SMIN:
+ // fold op(arg1, poison) -> poison, fold op(poison, arg2) -> poison.
+ return N2.getOpcode() == ISD::POISON ? N2 : N1;
+ }
+ }
// Canonicalize an UNDEF to the RHS, even over a constant.
- if (N1.isUndef()) {
+ if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() != ISD::UNDEF) {
if (TLI->isCommutativeBinOp(Opcode)) {
std::swap(N1, N2);
} else {
switch (Opcode) {
case ISD::PTRADD:
case ISD::SUB:
- // fold op(undef, arg2) -> undef, fold op(poison, arg2) ->poison.
- return N1.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT);
+ // fold op(undef, non_undef_arg2) -> undef.
+ return N1;
case ISD::SIGN_EXTEND_INREG:
case ISD::UDIV:
case ISD::SDIV:
@@ -7864,18 +7887,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::SREM:
case ISD::SSUBSAT:
case ISD::USUBSAT:
- // fold op(undef, arg2) -> 0, fold op(poison, arg2) -> poison.
- return N1.getOpcode() == ISD::POISON ? getPOISON(VT)
- : getConstant(0, DL, VT);
+ // fold op(undef, non_undef_arg2) -> 0.
+ return getConstant(0, DL, VT);
}
}
}
// Fold a bunch of operators when the RHS is undef.
- if (N2.isUndef()) {
+ if (N2.getOpcode() == ISD::UNDEF) {
switch (Opcode) {
case ISD::XOR:
- if (N1.isUndef())
+ if (N1.getOpcode() == ISD::UNDEF)
// Handle undef ^ undef -> 0 special case. This is a common
// idiom (misuse).
return getConstant(0, DL, VT);
@@ -7883,29 +7905,48 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::ADD:
case ISD::PTRADD:
case ISD::SUB:
+ // fold op(arg1, undef) -> undef.
+ return N2;
case ISD::UDIV:
case ISD::SDIV:
case ISD::UREM:
case ISD::SREM:
- // fold op(arg1, undef) -> undef, fold op(arg1, poison) -> poison.
- return N2.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT);
+ // fold op(arg1, undef) -> poison.
+ return getPOISON(VT);
case ISD::MUL:
case ISD::AND:
case ISD::SSUBSAT:
case ISD::USUBSAT:
- // fold op(arg1, undef) -> 0, fold op(arg1, poison) -> poison.
- return N2.getOpcode() == ISD::POISON ? getPOISON(VT)
- : getConstant(0, DL, VT);
+ case ISD::UMIN:
+ // fold op(undef, undef) -> undef, fold op(arg1, undef) -> 0.
+ return N1.getOpcode() == ISD::UNDEF ? N2 : getConstant(0, DL, VT);
case ISD::OR:
case ISD::SADDSAT:
case ISD::UADDSAT:
- // fold op(arg1, undef) -> an all-ones constant, fold op(arg1, poison) ->
- // poison.
- return N2.getOpcode() == ISD::POISON ? getPOISON(VT)
- : getAllOnesConstant(DL, VT);
+ case ISD::UMAX:
+ // fold op(undef, undef) -> undef, fold op(arg1, undef) -> -1.
+ return N1.getOpcode() == ISD::UNDEF ? N2 : getAllOnesConstant(DL, VT);
+ case ISD::SMAX:
+ // fold op(undef, undef) -> undef, fold op(arg1, undef) -> MAX_INT.
+ return N1.getOpcode() == ISD::UNDEF
+ ? N2
+ : getConstant(
+ APInt::getSignedMaxValue(VT.getScalarSizeInBits()), DL,
+ VT);
+ case ISD::SMIN:
+ // fold op(undef, undef) -> undef, fold op(arg1, undef) -> MIN_INT.
+ return N1.getOpcode() == ISD::UNDEF
+ ? N2
+ : getConstant(
+ APInt::getSignedMinValue(VT.getScalarSizeInBits()), DL,
+ VT);
}
}
+ // Perform trivial constant folding.
+ if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags))
+ return SV;
+
// Memoize this node if possible.
SDNode *N;
SDVTList VTs = getVTList(VT);
diff --git a/llvm/test/CodeGen/AArch64/combine-and-like.ll b/llvm/test/CodeGen/AArch64/combine-and-like.ll
index 15770c2e02ffd..ea1359b300620 100644
--- a/llvm/test/CodeGen/AArch64/combine-and-like.ll
+++ b/llvm/test/CodeGen/AArch64/combine-and-like.ll
@@ -4,7 +4,6 @@
define i32 @f(i32 %a0) {
; CHECK-LABEL: f:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: ret
%1 = lshr i32 %a0, 2147483647
%2 = add i32 %1, 2147483647
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 4e27cf20d3c98..019eb2c661edc 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -235,7 +235,7 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_med3_i32 v3, v2, s4, v4
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index 79adc9ead62e1..923017400adb1 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -202,10 +202,9 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_min_u32_e32 v3, 0xffff, v2
+; GFX6-NEXT: v_min_u32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_v3i16:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
index c12265bd7f372..ed2f06b8136a2 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
@@ -604,18 +604,18 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v8i8:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX7-SDAG-NEXT: v_max_i32_e32 v2, v2, v6
-; GFX7-SDAG-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v4
+; GFX7-SDAG-NEXT: v_max_i32_e32 v1, v1, v5
+; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -698,15 +698,15 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
; GFX9-SDAG-LABEL: test_vector_reduce_smax_v8i8:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX9-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX9-SDAG-NEXT: v_max_i16_sdwa v3, sext(v3), sext(v7) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_max3_i16 v1, v1, v5, v3
-; GFX9-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX9-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX9-SDAG-NEXT: v_max_i16_sdwa v2, sext(v2), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX9-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX9-SDAG-NEXT: v_max_i16_sdwa v1, sext(v1), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_max3_i16 v1, v1, v3, v7
+; GFX9-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
+; GFX9-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX9-SDAG-NEXT: v_max_i16_sdwa v0, sext(v0), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-SDAG-NEXT: v_max3_i16 v0, v0, v4, v2
+; GFX9-SDAG-NEXT: v_max3_i16 v0, v0, v2, v6
; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_max_i16_sdwa v0, v0, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -741,20 +741,20 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
; GFX10-SDAG-LABEL: test_vector_reduce_smax_v8i8:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX10-SDAG-NEXT: v_max_i16 v3, v3, v7
+; GFX10-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX10-SDAG-NEXT: v_max3_i16 v1, v1, v5, v3
-; GFX10-SDAG-NEXT: v_bfe_i32 v3, v6, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX10-SDAG-NEXT: v_max_i16 v1, v1, v5
+; GFX10-SDAG-NEXT: v_max3_i16 v1, v1, v3, v7
+; GFX10-SDAG-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v4, v6, 0, 8
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_max_i16 v2, v2, v3
+; GFX10-SDAG-NEXT: v_max_i16 v0, v0, v3
; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-SDAG-NEXT: v_max3_i16 v0, v0, v4, v2
+; GFX10-SDAG-NEXT: v_max3_i16 v0, v0, v2, v4
; GFX10-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-SDAG-NEXT: v_max_i16 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -796,62 +796,62 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v8i8:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v1, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v1.l, v5.l, v3.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v6, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v5.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v7, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v5, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v6
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v5
; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v2.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v8i8:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v1, v1, v5, v3
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v6, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v1, v1, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v6, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v2, v2, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v0, v0, v3
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v0, v0, v2, v4
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v0, v0, v1
@@ -906,39 +906,39 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v1, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v3.l
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v1.l, v5.l, v3.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v6, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v5.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v7, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v5, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v6
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v5
; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v2.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v2.l
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -949,23 +949,23 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_max3_i16 v1, v1, v5, v3
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v6, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT: v_max3_i16 v1, v1, v3, v7
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v6, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v2, v2, v3
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v0, v0, v3
; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_max3_i16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT: v_max3_i16 v0, v0, v2, v4
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v0, v0, v1
@@ -1025,32 +1025,32 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v16i8:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_bfe_i32 v14, v14, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v8, v8, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v9, v9, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v8, v8, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v12, v12, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX7-SDAG-NEXT: v_max_i32_e32 v7, v7, v15
-; GFX7-SDAG-NEXT: v_max_i32_e32 v6, v6, v14
-; GFX7-SDAG-NEXT: v_max_i32_e32 v4, v4, v12
-; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v8
-; GFX7-SDAG-NEXT: v_max_i32_e32 v5, v5, v13
+; GFX7-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v14, v14, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
; GFX7-SDAG-NEXT: v_max_i32_e32 v1, v1, v9
-; GFX7-SDAG-NEXT: v_max3_i32 v2, v2, v10, v6
-; GFX7-SDAG-NEXT: v_max3_i32 v3, v3, v11, v7
-; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v8
+; GFX7-SDAG-NEXT: v_max_i32_e32 v6, v6, v14
+; GFX7-SDAG-NEXT: v_max_i32_e32 v2, v2, v10
+; GFX7-SDAG-NEXT: v_max_i32_e32 v7, v7, v15
+; GFX7-SDAG-NEXT: v_max_i32_e32 v3, v3, v11
+; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v4, v12
+; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v5, v13
+; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1165,21 +1165,21 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
; GFX9-SDAG-LABEL: test_vector_reduce_smax_v16i8:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX9-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX9-SDAG-NEXT: v_max_i16_sdwa v7, sext(v7), sext(v15) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_max_i16_sdwa v5, sext(v5), sext(v13) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_bfe_i32 v13, v13, 0, 8
+; GFX9-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX9-SDAG-NEXT: v_max_i16_sdwa v1, sext(v1), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_max3_i16 v3, v3, v11, v7
-; GFX9-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 8
-; GFX9-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX9-SDAG-NEXT: v_max_i16_sdwa v6, sext(v6), sext(v14) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_max3_i16 v1, v1, v5, v3
-; GFX9-SDAG-NEXT: v_max_i16_sdwa v4, sext(v4), sext(v12) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_max_i16_sdwa v7, sext(v7), sext(v15) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_max_i16_sdwa v3, sext(v3), sext(v11) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_max3_i16 v1, v1, v5, v13
+; GFX9-SDAG-NEXT: v_bfe_i32 v12, v12, 0, 8
+; GFX9-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
; GFX9-SDAG-NEXT: v_max_i16_sdwa v0, sext(v0), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_max3_i16 v2, v2, v10, v6
+; GFX9-SDAG-NEXT: v_max3_i16 v1, v1, v3, v7
+; GFX9-SDAG-NEXT: v_max_i16_sdwa v6, sext(v6), sext(v14) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_max_i16_sdwa v2, sext(v2), sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_max3_i16 v0, v0, v4, v12
; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-SDAG-NEXT: v_max3_i16 v0, v0, v4, v2
+; GFX9-SDAG-NEXT: v_max3_i16 v0, v0, v2, v6
; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_max_i16_sdwa v0, v0, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1222,34 +1222,34 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
; GFX10-SDAG-LABEL: test_vector_reduce_smax_v16i8:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v9, v9, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX10-SDAG-NEXT: v_max_i16 v1, v1, v9
+; GFX10-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v9, v12, 0, 8
; GFX10-SDAG-NEXT: v_max_i16 v7, v7, v15
+; GFX10-SDAG-NEXT: v_max_i16 v3, v3, v11
+; GFX10-SDAG-NEXT: v_max3_i16 v1, v1, v5, v13
+; GFX10-SDAG-NEXT: v_bfe_i32 v5, v8, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v8, v10, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 8
-; GFX10-SDAG-NEXT: v_max_i16 v5, v5, v13
-; GFX10-SDAG-NEXT: v_max_i16 v1, v1, v9
-; GFX10-SDAG-NEXT: v_max3_i16 v3, v3, v11, v7
-; GFX10-SDAG-NEXT: v_bfe_i32 v7, v14, 0, 8
+; GFX10-SDAG-NEXT: v_max3_i16 v1, v1, v3, v7
+; GFX10-SDAG-NEXT: v_bfe_i32 v3, v14, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v8, v8, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX10-SDAG-NEXT: v_max3_i16 v1, v1, v5, v3
-; GFX10-SDAG-NEXT: v_bfe_i32 v3, v12, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX10-SDAG-NEXT: v_max_i16 v5, v6, v7
+; GFX10-SDAG-NEXT: v_max_i16 v0, v0, v5
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_max_i16 v3, v4, v3
-; GFX10-SDAG-NEXT: v_max_i16 v0, v0, v8
-; GFX10-SDAG-NEXT: v_max3_i16 v2, v2, v10, v5
+; GFX10-SDAG-NEXT: v_max_i16 v3, v6, v3
+; GFX10-SDAG-NEXT: v_max_i16 v2, v2, v8
+; GFX10-SDAG-NEXT: v_max3_i16 v0, v0, v4, v9
; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-SDAG-NEXT: v_max3_i16 v0, v0, v3, v2
+; GFX10-SDAG-NEXT: v_max3_i16 v0, v0, v2, v3
; GFX10-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-SDAG-NEXT: v_max_i16 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1307,59 +1307,58 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v16i8:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v16, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v9.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v15, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v7, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v14, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v14, v6, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v16, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v17, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v4, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v5, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v10, v10, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v13, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v18.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v11, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v11.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.h, v2.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v1.h, v4.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v9, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v6.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v8, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v14.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v17, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.h, v0.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v11, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.l
; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.h, v2.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l
+; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v1.h, v4.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v12, v12, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v14.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v10, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v17.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v18, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v9, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.l
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.h, v2.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.l
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v7
+; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v4.l
; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v5.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.h, v2.l, v3.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.h, v0.h, v2.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.h, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
@@ -1368,37 +1367,37 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v16i8:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v15, v15, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v9, v9, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v11, v11, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v15, v15, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v1, v1, v9
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v9, v12, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v3, v3, v11
+; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v1, v1, v5, v13
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v8, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v8, v10, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v10, v10, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v5, v5, v13
-; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v1, v1, v9
-; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v3, v3, v11, v7
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v14, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v1, v1, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v14, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v8, v8, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v1, v1, v5, v3
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v12, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v5, v6, v7
+; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v0, v0, v5
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v3, v4, v3
+; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v3, v6, v3
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v0, v0, v8
-; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v2, v2, v10, v5
+; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v2, v2, v8
+; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v0, v0, v4, v9
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v0, v0, v3, v2
+; GFX11-SDAG-FAKE16-NEXT: v_max3_i16 v0, v0, v2, v3
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_max_i16 v0, v0, v1
@@ -1468,59 +1467,58 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v16, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v9.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v15, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v7, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v14, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v14, v6, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v16, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v17, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v4, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v5, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v10, v10, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v13, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v18.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v11, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v11.l
-; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.h, v2.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v1.h, v4.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v9, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v6.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v8, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v14.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v17, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.h, v0.h, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v11, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.l
; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.h, v2.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l
+; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v1.h, v4.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v12, v12, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v14.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v10, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v17.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v18, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v9, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.l
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.l
-; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.h, v2.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.l
; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v7
+; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v4.l
; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v5.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.h, v2.l, v3.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.h, v0.h, v2.l, v3.l
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.h, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
@@ -1533,37 +1531,37 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v15, v15, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v9, v9, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v11, v11, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v15, v15, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v1, v1, v9
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v9, v12, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v3, v3, v11
+; GFX12-SDAG-FAKE16-NEXT: v_max3_i16 v1, v1, v5, v13
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v8, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v8, v10, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v10, v10, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v5, v5, v13
-; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v1, v1, v9
-; GFX12-SDAG-FAKE16-NEXT: v_max3_i16 v3, v3, v11, v7
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v14, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_max3_i16 v1, v1, v3, v7
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v14, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v8, v8, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_max3_i16 v1, v1, v5, v3
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v12, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v5, v6, v7
+; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v0, v0, v5
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v3, v4, v3
+; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v3, v6, v3
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v0, v0, v8
-; GFX12-SDAG-FAKE16-NEXT: v_max3_i16 v2, v2, v10, v5
+; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v2, v2, v8
+; GFX12-SDAG-FAKE16-NEXT: v_max3_i16 v0, v0, v4, v9
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-FAKE16-NEXT: v_max3_i16 v0, v0, v3, v2
+; GFX12-SDAG-FAKE16-NEXT: v_max3_i16 v0, v0, v2, v3
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_max_i16 v0, v0, v1
@@ -2055,18 +2053,18 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v8i16:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-SDAG-NEXT: v_max_i32_e32 v2, v2, v6
-; GFX7-SDAG-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v4
+; GFX7-SDAG-NEXT: v_max_i32_e32 v1, v1, v5
+; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2253,32 +2251,32 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v16i16:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_bfe_i32 v14, v14, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v9, v9, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v13, v13, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v12, v12, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-SDAG-NEXT: v_max_i32_e32 v7, v7, v15
-; GFX7-SDAG-NEXT: v_max_i32_e32 v6, v6, v14
-; GFX7-SDAG-NEXT: v_max_i32_e32 v4, v4, v12
-; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v8
-; GFX7-SDAG-NEXT: v_max_i32_e32 v5, v5, v13
+; GFX7-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16
; GFX7-SDAG-NEXT: v_max_i32_e32 v1, v1, v9
-; GFX7-SDAG-NEXT: v_max3_i32 v2, v2, v10, v6
-; GFX7-SDAG-NEXT: v_max3_i32 v3, v3, v11, v7
-; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v8
+; GFX7-SDAG-NEXT: v_max_i32_e32 v6, v6, v14
+; GFX7-SDAG-NEXT: v_max_i32_e32 v2, v2, v10
+; GFX7-SDAG-NEXT: v_max_i32_e32 v7, v7, v15
+; GFX7-SDAG-NEXT: v_max_i32_e32 v3, v3, v11
+; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v4, v12
+; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v5, v13
+; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
index 5056747c33cc2..8812cae20f110 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
@@ -604,18 +604,18 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smin_v8i8:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX7-SDAG-NEXT: v_min_i32_e32 v2, v2, v6
-; GFX7-SDAG-NEXT: v_min_i32_e32 v3, v3, v7
-; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v4
+; GFX7-SDAG-NEXT: v_min_i32_e32 v1, v1, v5
+; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -698,15 +698,15 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
; GFX9-SDAG-LABEL: test_vector_reduce_smin_v8i8:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX9-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX9-SDAG-NEXT: v_min_i16_sdwa v3, sext(v3), sext(v7) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_min3_i16 v1, v1, v5, v3
-; GFX9-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX9-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX9-SDAG-NEXT: v_min_i16_sdwa v2, sext(v2), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX9-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX9-SDAG-NEXT: v_min_i16_sdwa v1, sext(v1), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_min3_i16 v1, v1, v3, v7
+; GFX9-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
+; GFX9-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX9-SDAG-NEXT: v_min_i16_sdwa v0, sext(v0), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-SDAG-NEXT: v_min3_i16 v0, v0, v4, v2
+; GFX9-SDAG-NEXT: v_min3_i16 v0, v0, v2, v6
; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_min_i16_sdwa v0, v0, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -741,20 +741,20 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
; GFX10-SDAG-LABEL: test_vector_reduce_smin_v8i8:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX10-SDAG-NEXT: v_min_i16 v3, v3, v7
+; GFX10-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX10-SDAG-NEXT: v_min3_i16 v1, v1, v5, v3
-; GFX10-SDAG-NEXT: v_bfe_i32 v3, v6, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX10-SDAG-NEXT: v_min_i16 v1, v1, v5
+; GFX10-SDAG-NEXT: v_min3_i16 v1, v1, v3, v7
+; GFX10-SDAG-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v4, v6, 0, 8
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_min_i16 v2, v2, v3
+; GFX10-SDAG-NEXT: v_min_i16 v0, v0, v3
; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-SDAG-NEXT: v_min3_i16 v0, v0, v4, v2
+; GFX10-SDAG-NEXT: v_min3_i16 v0, v0, v2, v4
; GFX10-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-SDAG-NEXT: v_min_i16 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -796,62 +796,62 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v8i8:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v1, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v1.l, v1.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v1.l, v5.l, v3.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v6, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v1.l, v5.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v7, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v5, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v6
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v5
; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v1.l, v2.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v8i8:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min3_i16 v1, v1, v5, v3
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v6, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT: v_min3_i16 v1, v1, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v6, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v2, v2, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v0, v0, v3
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min3_i16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT: v_min3_i16 v0, v0, v2, v4
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v0, v0, v1
@@ -906,39 +906,39 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v1, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v1.l, v1.l, v3.l
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v1.l, v5.l, v3.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v6, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v1.l, v5.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v7, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v5, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v6
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v5
; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v1.l, v2.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.l, v2.l
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -949,23 +949,23 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_min3_i16 v1, v1, v5, v3
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v6, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT: v_min3_i16 v1, v1, v3, v7
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v6, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v2, v2, v3
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v0, v0, v3
; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_min3_i16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT: v_min3_i16 v0, v0, v2, v4
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v0, v0, v1
@@ -1025,32 +1025,32 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smin_v16i8:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_bfe_i32 v14, v14, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v8, v8, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v9, v9, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v8, v8, 0, 8
-; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v12, v12, 0, 8
; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX7-SDAG-NEXT: v_min_i32_e32 v7, v7, v15
-; GFX7-SDAG-NEXT: v_min_i32_e32 v6, v6, v14
-; GFX7-SDAG-NEXT: v_min_i32_e32 v4, v4, v12
-; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v8
-; GFX7-SDAG-NEXT: v_min_i32_e32 v5, v5, v13
+; GFX7-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v14, v14, 0, 8
+; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
; GFX7-SDAG-NEXT: v_min_i32_e32 v1, v1, v9
-; GFX7-SDAG-NEXT: v_min3_i32 v2, v2, v10, v6
-; GFX7-SDAG-NEXT: v_min3_i32 v3, v3, v11, v7
-; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v8
+; GFX7-SDAG-NEXT: v_min_i32_e32 v6, v6, v14
+; GFX7-SDAG-NEXT: v_min_i32_e32 v2, v2, v10
+; GFX7-SDAG-NEXT: v_min_i32_e32 v7, v7, v15
+; GFX7-SDAG-NEXT: v_min_i32_e32 v3, v3, v11
+; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v4, v12
+; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v5, v13
+; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1165,21 +1165,21 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
; GFX9-SDAG-LABEL: test_vector_reduce_smin_v16i8:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX9-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX9-SDAG-NEXT: v_min_i16_sdwa v7, sext(v7), sext(v15) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_min_i16_sdwa v5, sext(v5), sext(v13) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_bfe_i32 v13, v13, 0, 8
+; GFX9-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX9-SDAG-NEXT: v_min_i16_sdwa v1, sext(v1), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_min3_i16 v3, v3, v11, v7
-; GFX9-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 8
-; GFX9-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX9-SDAG-NEXT: v_min_i16_sdwa v6, sext(v6), sext(v14) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_min3_i16 v1, v1, v5, v3
-; GFX9-SDAG-NEXT: v_min_i16_sdwa v4, sext(v4), sext(v12) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_min_i16_sdwa v7, sext(v7), sext(v15) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_min_i16_sdwa v3, sext(v3), sext(v11) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_min3_i16 v1, v1, v5, v13
+; GFX9-SDAG-NEXT: v_bfe_i32 v12, v12, 0, 8
+; GFX9-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
; GFX9-SDAG-NEXT: v_min_i16_sdwa v0, sext(v0), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_min3_i16 v2, v2, v10, v6
+; GFX9-SDAG-NEXT: v_min3_i16 v1, v1, v3, v7
+; GFX9-SDAG-NEXT: v_min_i16_sdwa v6, sext(v6), sext(v14) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_min_i16_sdwa v2, sext(v2), sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_min3_i16 v0, v0, v4, v12
; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-SDAG-NEXT: v_min3_i16 v0, v0, v4, v2
+; GFX9-SDAG-NEXT: v_min3_i16 v0, v0, v2, v6
; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_min_i16_sdwa v0, v0, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1222,34 +1222,34 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
; GFX10-SDAG-LABEL: test_vector_reduce_smin_v16i8:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v9, v9, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX10-SDAG-NEXT: v_min_i16 v1, v1, v9
+; GFX10-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v9, v12, 0, 8
; GFX10-SDAG-NEXT: v_min_i16 v7, v7, v15
+; GFX10-SDAG-NEXT: v_min_i16 v3, v3, v11
+; GFX10-SDAG-NEXT: v_min3_i16 v1, v1, v5, v13
+; GFX10-SDAG-NEXT: v_bfe_i32 v5, v8, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX10-SDAG-NEXT: v_bfe_i32 v8, v10, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 8
-; GFX10-SDAG-NEXT: v_min_i16 v5, v5, v13
-; GFX10-SDAG-NEXT: v_min_i16 v1, v1, v9
-; GFX10-SDAG-NEXT: v_min3_i16 v3, v3, v11, v7
-; GFX10-SDAG-NEXT: v_bfe_i32 v7, v14, 0, 8
+; GFX10-SDAG-NEXT: v_min3_i16 v1, v1, v3, v7
+; GFX10-SDAG-NEXT: v_bfe_i32 v3, v14, 0, 8
; GFX10-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v8, v8, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX10-SDAG-NEXT: v_min3_i16 v1, v1, v5, v3
-; GFX10-SDAG-NEXT: v_bfe_i32 v3, v12, 0, 8
-; GFX10-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX10-SDAG-NEXT: v_min_i16 v5, v6, v7
+; GFX10-SDAG-NEXT: v_min_i16 v0, v0, v5
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_min_i16 v3, v4, v3
-; GFX10-SDAG-NEXT: v_min_i16 v0, v0, v8
-; GFX10-SDAG-NEXT: v_min3_i16 v2, v2, v10, v5
+; GFX10-SDAG-NEXT: v_min_i16 v3, v6, v3
+; GFX10-SDAG-NEXT: v_min_i16 v2, v2, v8
+; GFX10-SDAG-NEXT: v_min3_i16 v0, v0, v4, v9
; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-SDAG-NEXT: v_min3_i16 v0, v0, v3, v2
+; GFX10-SDAG-NEXT: v_min3_i16 v0, v0, v2, v3
; GFX10-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-SDAG-NEXT: v_min_i16 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1307,59 +1307,58 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v16i8:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v16, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v9.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v15, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v7, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v14, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v14, v6, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v16, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v17, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v4, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v5, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v10, v10, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v13, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v18.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v11, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v11.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.h, v2.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v1.h, v4.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v9, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v1.l, v6.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v8, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v14.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v17, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v1.h, v0.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v11, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.l
; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.h, v2.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l
+; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v1.h, v4.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v12, v12, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v14.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v10, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v17.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v18, 0, 8
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v9, 0, 8
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.l
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v1.l, v1.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.h, v2.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.l
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v7
+; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v1.l, v1.l, v4.l
; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v5.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.h, v2.l, v3.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.h, v0.h, v2.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.h, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l
@@ -1368,37 +1367,37 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v16i8:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v15, v15, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v9, v9, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v11, v11, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v15, v15, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v1, v1, v9
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v9, v12, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v3, v3, v11
+; GFX11-SDAG-FAKE16-NEXT: v_min3_i16 v1, v1, v5, v13
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v8, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v8, v10, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v10, v10, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v5, v5, v13
-; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v1, v1, v9
-; GFX11-SDAG-FAKE16-NEXT: v_min3_i16 v3, v3, v11, v7
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v14, 0, 8
+; GFX11-SDAG-FAKE16-NEXT: v_min3_i16 v1, v1, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v14, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v8, v8, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_min3_i16 v1, v1, v5, v3
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v12, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v5, v6, v7
+; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v0, v0, v5
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v3, v4, v3
+; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v3, v6, v3
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v0, v0, v8
-; GFX11-SDAG-FAKE16-NEXT: v_min3_i16 v2, v2, v10, v5
+; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v2, v2, v8
+; GFX11-SDAG-FAKE16-NEXT: v_min3_i16 v0, v0, v4, v9
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: v_min3_i16 v0, v0, v3, v2
+; GFX11-SDAG-FAKE16-NEXT: v_min3_i16 v0, v0, v2, v3
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX11-SDAG-FAKE16-NEXT: v_min_i16 v0, v0, v1
@@ -1468,59 +1467,58 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v16, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v9.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v15, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v7, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v14, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v14, v6, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v16, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v17, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v4, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v5, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v10, v10, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v13, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v18.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v11, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v11.l
-; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.h, v2.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v1.h, v4.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v9, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v1.l, v6.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v8, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v14.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v17, 0, 8
-; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v1.h, v0.h, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v11, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.l
; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.h, v2.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l
+; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v1.h, v4.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v12, v12, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v14.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v10, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v17.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v18, 0, 8
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v9, 0, 8
; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.l
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.l
-; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v1.l, v1.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.h, v2.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.l
; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v7
+; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v1.l, v1.l, v4.l
; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v5.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.h, v2.l, v3.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.h, v0.h, v2.l, v3.l
; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.h, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l
@@ -1533,37 +1531,37 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v15, v15, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v11, v11, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v9, v9, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v11, v11, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v15, v15, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v1, v1, v9
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v9, v12, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v3, v3, v11
+; GFX12-SDAG-FAKE16-NEXT: v_min3_i16 v1, v1, v5, v13
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v5, v8, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v8, v10, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v10, v10, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v5, v5, v13
-; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v1, v1, v9
-; GFX12-SDAG-FAKE16-NEXT: v_min3_i16 v3, v3, v11, v7
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v7, v14, 0, 8
+; GFX12-SDAG-FAKE16-NEXT: v_min3_i16 v1, v1, v3, v7
+; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v14, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v8, v8, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_min3_i16 v1, v1, v5, v3
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v3, v12, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v5, v6, v7
+; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v0, v0, v5
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v3, v4, v3
+; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v3, v6, v3
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v0, v0, v8
-; GFX12-SDAG-FAKE16-NEXT: v_min3_i16 v2, v2, v10, v5
+; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v2, v2, v8
+; GFX12-SDAG-FAKE16-NEXT: v_min3_i16 v0, v0, v4, v9
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-FAKE16-NEXT: v_min3_i16 v0, v0, v3, v2
+; GFX12-SDAG-FAKE16-NEXT: v_min3_i16 v0, v0, v2, v3
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-SDAG-FAKE16-NEXT: v_min_i16 v0, v0, v1
@@ -2055,18 +2053,18 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smin_v8i16:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-SDAG-NEXT: v_min_i32_e32 v2, v2, v6
-; GFX7-SDAG-NEXT: v_min_i32_e32 v3, v3, v7
-; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v4
+; GFX7-SDAG-NEXT: v_min_i32_e32 v1, v1, v5
+; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2253,32 +2251,32 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smin_v16i16:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_bfe_i32 v14, v14, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v9, v9, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v13, v13, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v12, v12, 0, 16
; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-SDAG-NEXT: v_min_i32_e32 v7, v7, v15
-; GFX7-SDAG-NEXT: v_min_i32_e32 v6, v6, v14
-; GFX7-SDAG-NEXT: v_min_i32_e32 v4, v4, v12
-; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v8
-; GFX7-SDAG-NEXT: v_min_i32_e32 v5, v5, v13
+; GFX7-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16
; GFX7-SDAG-NEXT: v_min_i32_e32 v1, v1, v9
-; GFX7-SDAG-NEXT: v_min3_i32 v2, v2, v10, v6
-; GFX7-SDAG-NEXT: v_min3_i32 v3, v3, v11, v7
-; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v8
+; GFX7-SDAG-NEXT: v_min_i32_e32 v6, v6, v14
+; GFX7-SDAG-NEXT: v_min_i32_e32 v2, v2, v10
+; GFX7-SDAG-NEXT: v_min_i32_e32 v7, v7, v15
+; GFX7-SDAG-NEXT: v_min_i32_e32 v3, v3, v11
+; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v4, v12
+; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v5, v13
+; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
index ddae1b296024e..82eb122f9f703 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
@@ -320,7 +320,7 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX8-SDAG-NEXT: v_max_u16_sdwa v1, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-SDAG-NEXT: v_max_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX8-SDAG-NEXT: v_max_u16_e32 v0, v0, v1
+; GFX8-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_vector_reduce_umax_v4i8:
@@ -351,8 +351,9 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: v_max_u16_sdwa v1, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX9-SDAG-NEXT: v_max3_u16 v0, v0, v2, v1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -387,9 +388,9 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX10-SDAG-NEXT: v_max_u16 v1, v1, v3
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 8
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_lshrrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-SDAG-NEXT: v_max3_u16 v0, v0, v2, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -429,8 +430,8 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v1.l, v1.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v0.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -446,8 +447,8 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v1, v1, v3
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v2, v1
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -500,8 +501,8 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v1.l, v1.l, v1.h
; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v0.h, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -521,8 +522,8 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v1, v1, v3
; GFX12-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v2, v1
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -572,18 +573,18 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v8i8:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX7-SDAG-NEXT: v_max_u32_e32 v2, v2, v6
-; GFX7-SDAG-NEXT: v_max_u32_e32 v3, v3, v7
-; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v4
+; GFX7-SDAG-NEXT: v_max_u32_e32 v1, v1, v5
+; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -628,7 +629,7 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX8-SDAG-NEXT: v_max_u16_sdwa v1, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_max_u16_e32 v0, v0, v2
; GFX8-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX8-SDAG-NEXT: v_max_u16_e32 v0, v0, v1
+; GFX8-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_vector_reduce_umax_v8i8:
@@ -660,17 +661,17 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX9-SDAG-LABEL: test_vector_reduce_umax_v8i8:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX9-SDAG-NEXT: v_max_u16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_max3_u16 v1, v1, v5, v3
-; GFX9-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX9-SDAG-NEXT: v_max_u16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX9-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX9-SDAG-NEXT: v_max_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_max3_u16 v1, v1, v3, v7
+; GFX9-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX9-SDAG-NEXT: v_max_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-SDAG-NEXT: v_max3_u16 v0, v0, v4, v2
+; GFX9-SDAG-NEXT: v_max3_u16 v0, v0, v2, v6
; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX9-SDAG-NEXT: v_max_u16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_vector_reduce_umax_v8i8:
@@ -702,21 +703,21 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX10-SDAG-LABEL: test_vector_reduce_umax_v8i8:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX10-SDAG-NEXT: v_max_u16 v3, v3, v7
+; GFX10-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX10-SDAG-NEXT: v_max_u16 v2, v2, v6
-; GFX10-SDAG-NEXT: v_max3_u16 v1, v1, v5, v3
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-SDAG-NEXT: v_max_u16 v1, v1, v5
+; GFX10-SDAG-NEXT: v_max3_u16 v1, v1, v3, v7
; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 8
+; GFX10-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v6
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_max3_u16 v0, v0, v3, v2
-; GFX10-SDAG-NEXT: v_lshrrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-SDAG-NEXT: v_max_u16 v0, v0, v3
+; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-SDAG-NEXT: v_max3_u16 v0, v0, v2, v4
+; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-SDAG-NEXT: v_max_u16 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -756,50 +757,49 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v8i8:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v5.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v1.h, v3.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v1.l, v1.l, v3.h, v1.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v1.l, v1.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.h, v1.h, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v1.l, v3.l, v3.h
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v1.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v3
+; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v1.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v8i8:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v1, v1, v5, v3
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v6
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v1, v1, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v6
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v2, v2, v3
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v3
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v2, v4
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v1
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -852,27 +852,26 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v5.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v7.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v1.h, v3.l, v1.h
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v1.l, v1.l, v3.h, v1.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v1.l, v1.l, v1.h
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.h, v1.h, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v1.l, v3.l, v3.h
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v1.l, v0.h
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v3
+; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v1.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v8i8:
@@ -882,24 +881,24 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v1, v1, v5, v3
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v6
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v1, v1, v3, v7
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v6
; GFX12-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v2, v2, v3
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v3
; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v2, v4
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v1
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -957,32 +956,32 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v16i8:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX7-SDAG-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX7-SDAG-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX7-SDAG-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-SDAG-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX7-SDAG-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX7-SDAG-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-SDAG-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX7-SDAG-NEXT: v_max_u32_e32 v7, v7, v15
-; GFX7-SDAG-NEXT: v_max_u32_e32 v6, v6, v14
-; GFX7-SDAG-NEXT: v_max_u32_e32 v4, v4, v12
-; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v8
-; GFX7-SDAG-NEXT: v_max_u32_e32 v5, v5, v13
+; GFX7-SDAG-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX7-SDAG-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX7-SDAG-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX7-SDAG-NEXT: v_max_u32_e32 v1, v1, v9
-; GFX7-SDAG-NEXT: v_max3_u32 v2, v2, v10, v6
-; GFX7-SDAG-NEXT: v_max3_u32 v3, v3, v11, v7
-; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v8
+; GFX7-SDAG-NEXT: v_max_u32_e32 v6, v6, v14
+; GFX7-SDAG-NEXT: v_max_u32_e32 v2, v2, v10
+; GFX7-SDAG-NEXT: v_max_u32_e32 v7, v7, v15
+; GFX7-SDAG-NEXT: v_max_u32_e32 v3, v3, v11
+; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v4, v12
+; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v5, v13
+; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1051,9 +1050,8 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX8-SDAG-NEXT: v_max_u16_e32 v0, v0, v2
; GFX8-SDAG-NEXT: v_max_u16_sdwa v1, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-SDAG-NEXT: v_mov_b32_e32 v1, 8
-; GFX8-SDAG-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GFX8-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: test_vector_reduce_umax_v16i8:
@@ -1093,25 +1091,24 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX9-SDAG-LABEL: test_vector_reduce_umax_v16i8:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX9-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX9-SDAG-NEXT: v_max_u16_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX9-SDAG-NEXT: v_max_u16_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX9-SDAG-NEXT: v_max_u16_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_max_u16_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_max3_u16 v3, v3, v11, v7
-; GFX9-SDAG-NEXT: v_max_u16_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX9-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX9-SDAG-NEXT: v_max_u16_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_max_u16_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-SDAG-NEXT: v_max_u16_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_max3_u16 v2, v2, v10, v6
-; GFX9-SDAG-NEXT: v_max3_u16 v1, v1, v5, v3
-; GFX9-SDAG-NEXT: v_max3_u16 v0, v0, v4, v2
+; GFX9-SDAG-NEXT: v_max3_u16 v1, v1, v5, v13
+; GFX9-SDAG-NEXT: v_max_u16_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_max_u16_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_max3_u16 v0, v0, v4, v12
+; GFX9-SDAG-NEXT: v_max3_u16 v1, v1, v3, v7
+; GFX9-SDAG-NEXT: v_max3_u16 v0, v0, v2, v6
; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 8
-; GFX9-SDAG-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-SDAG-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_vector_reduce_umax_v16i8:
@@ -1151,38 +1148,38 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX10-SDAG-LABEL: test_vector_reduce_umax_v16i8:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX10-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX10-SDAG-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX10-SDAG-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX10-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX10-SDAG-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-SDAG-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-SDAG-NEXT: v_max_u16 v7, v7, v15
-; GFX10-SDAG-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX10-SDAG-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX10-SDAG-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX10-SDAG-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX10-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX10-SDAG-NEXT: v_max_u16 v1, v1, v9
; GFX10-SDAG-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX10-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX10-SDAG-NEXT: v_max_u16 v5, v5, v13
-; GFX10-SDAG-NEXT: v_max_u16 v1, v1, v9
-; GFX10-SDAG-NEXT: v_max_u16 v6, v6, v14
-; GFX10-SDAG-NEXT: v_max3_u16 v3, v3, v11, v7
-; GFX10-SDAG-NEXT: v_max_u16 v4, v4, v12
+; GFX10-SDAG-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-SDAG-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX10-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX10-SDAG-NEXT: v_max_u16 v7, v7, v15
+; GFX10-SDAG-NEXT: v_max_u16 v3, v3, v11
; GFX10-SDAG-NEXT: v_max_u16 v0, v0, v8
-; GFX10-SDAG-NEXT: v_max3_u16 v2, v2, v10, v6
-; GFX10-SDAG-NEXT: v_max3_u16 v1, v1, v5, v3
-; GFX10-SDAG-NEXT: v_max3_u16 v0, v0, v4, v2
+; GFX10-SDAG-NEXT: v_max3_u16 v1, v1, v5, v13
+; GFX10-SDAG-NEXT: v_max_u16 v5, v6, v14
+; GFX10-SDAG-NEXT: v_max_u16 v2, v2, v10
+; GFX10-SDAG-NEXT: v_max3_u16 v0, v0, v4, v12
+; GFX10-SDAG-NEXT: v_max3_u16 v1, v1, v3, v7
+; GFX10-SDAG-NEXT: v_max3_u16 v0, v0, v2, v5
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 8
-; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX10-SDAG-NEXT: v_lshrrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-SDAG-NEXT: v_max_u16 v0, v2, v0
+; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX10-SDAG-NEXT: v_max_u16 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: test_vector_reduce_umax_v16i8:
@@ -1237,84 +1234,82 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v16i8:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v15.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v14.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v9.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v13.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v13.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.h, v0.h, v10.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v8.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v11.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v15.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.h, v0.h, v9.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v12.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v4.h, v5.l, v4.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v1.l, v1.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v1.h, v6.l, v6.h
-; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v3.l, v3.h, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v3.l, v4.l, v5.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v3.h, v6.h, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v2.h, v3.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v0.h, v5.l, v5.h
+; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v1.l, v6.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v1.h, v2.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v1.h, v2.l, v2.h, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v1.l, v4.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v4.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v0.h, v2.h, v3.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v3.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v1.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v16i8:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v7, v7, v15
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v1, v1, v9
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v5, v5, v13
-; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v1, v1, v9
-; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v6, v6, v14
-; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v3, v3, v11, v7
-; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v4, v4, v12
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v3, v3, v11
; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v1, v1, v5, v13
+; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v5, v6, v14
+; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v2, v2, v10
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v2, v2, v10, v6
-; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v1, v1, v5, v3
+; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v4, v12
+; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v1, v1, v3, v7
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v2, v5
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v1
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1382,44 +1377,42 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v15.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v14.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v9.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v13.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v13.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.h, v0.h, v10.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v8.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v11.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v15.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.h, v0.h, v9.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v12.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v4.h, v5.l, v4.h
-; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v1.l, v1.l, v1.h
-; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v1.h, v6.l, v6.h
-; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v3.l, v3.h, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v3.l, v4.l, v5.h
-; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v3.h, v6.h, v3.h
+; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v2.h, v3.l, v2.h
+; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v0.h, v5.l, v5.h
+; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v1.l, v6.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v1.h, v2.l, v1.h
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v1.h, v2.l, v2.h, v1.h
-; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v1.l, v4.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v4.l, v4.h
+; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.h, v0.h, v2.h, v3.h
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v3.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT: v_max3_u16 v0.l, v0.l, v1.h, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v16i8:
@@ -1429,41 +1422,41 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v7, v7, v15
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v1, v1, v9
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v5, v5, v13
-; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v1, v1, v9
-; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v6, v6, v14
-; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v3, v3, v11, v7
-; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v4, v4, v12
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v3, v3, v11
; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v1, v1, v5, v13
+; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v5, v6, v14
+; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v2, v2, v10
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v2, v2, v10, v6
-; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v1, v1, v5, v3
+; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v4, v12
+; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v1, v1, v3, v7
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT: v_max3_u16 v0, v0, v2, v5
; GFX12-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX12-SDAG-FAKE16-NEXT: v_max_u16 v0, v0, v1
; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1940,18 +1933,18 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v8i16:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX7-SDAG-NEXT: v_max_u32_e32 v2, v2, v6
-; GFX7-SDAG-NEXT: v_max_u32_e32 v3, v3, v7
-; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v4
+; GFX7-SDAG-NEXT: v_max_u32_e32 v1, v1, v5
+; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2136,32 +2129,32 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v16i16:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX7-SDAG-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX7-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX7-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-SDAG-NEXT: v_and_b32_e32 v13, 0xffff, v13
; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX7-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-SDAG-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-SDAG-NEXT: v_max_u32_e32 v7, v7, v15
-; GFX7-SDAG-NEXT: v_max_u32_e32 v6, v6, v14
-; GFX7-SDAG-NEXT: v_max_u32_e32 v4, v4, v12
-; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v8
-; GFX7-SDAG-NEXT: v_max_u32_e32 v5, v5, v13
+; GFX7-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-SDAG-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX7-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX7-SDAG-NEXT: v_max_u32_e32 v1, v1, v9
-; GFX7-SDAG-NEXT: v_max3_u32 v2, v2, v10, v6
-; GFX7-SDAG-NEXT: v_max3_u32 v3, v3, v11, v7
-; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v8
+; GFX7-SDAG-NEXT: v_max_u32_e32 v6, v6, v14
+; GFX7-SDAG-NEXT: v_max_u32_e32 v2, v2, v10
+; GFX7-SDAG-NEXT: v_max_u32_e32 v7, v7, v15
+; GFX7-SDAG-NEXT: v_max_u32_e32 v3, v3, v11
+; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v4, v12
+; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v5, v13
+; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
index e3a7ae5fd0256..115b05a5ca6a2 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
@@ -485,18 +485,18 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v8i8:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX7-SDAG-NEXT: v_min_u32_e32 v2, v2, v6
-; GFX7-SDAG-NEXT: v_min_u32_e32 v3, v3, v7
-; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v4
+; GFX7-SDAG-NEXT: v_min_u32_e32 v1, v1, v5
+; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -549,15 +549,15 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
; GFX9-SDAG-LABEL: test_vector_reduce_umin_v8i8:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX9-SDAG-NEXT: v_min_u16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_min3_u16 v1, v1, v5, v3
-; GFX9-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX9-SDAG-NEXT: v_min_u16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX9-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX9-SDAG-NEXT: v_min_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_min3_u16 v1, v1, v3, v7
+; GFX9-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX9-SDAG-NEXT: v_min_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-SDAG-NEXT: v_min3_u16 v0, v0, v4, v2
+; GFX9-SDAG-NEXT: v_min3_u16 v0, v0, v2, v6
; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_min_u16_e32 v0, v0, v1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -578,20 +578,20 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
; GFX10-SDAG-LABEL: test_vector_reduce_umin_v8i8:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX10-SDAG-NEXT: v_min_u16 v3, v3, v7
+; GFX10-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX10-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX10-SDAG-NEXT: v_min_u16 v2, v2, v6
-; GFX10-SDAG-NEXT: v_min3_u16 v1, v1, v5, v3
-; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX10-SDAG-NEXT: v_min_u16 v1, v1, v5
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-SDAG-NEXT: v_min_u16 v0, v0, v4
+; GFX10-SDAG-NEXT: v_min3_u16 v1, v1, v3, v7
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v6
; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 8
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_min3_u16 v0, v0, v3, v2
+; GFX10-SDAG-NEXT: v_min3_u16 v0, v0, v2, v3
; GFX10-SDAG-NEXT: v_lshrrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-SDAG-NEXT: v_min_u16 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -620,24 +620,24 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v8i8:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v5.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v1.h, v3.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v1.l, v1.l, v3.h, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v1.l, v1.l, v3.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.h, v1.h, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
@@ -646,23 +646,23 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v8i8:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min3_u16 v1, v1, v5, v3
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v6
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT: v_min3_u16 v1, v1, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v6
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v2, v2, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v0, v0, v3
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min3_u16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT: v_min3_u16 v0, v0, v2, v4
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v0, v0, v1
@@ -699,24 +699,24 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v5.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v7.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v1.h, v3.l, v1.h
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v1.l, v1.l, v3.h, v1.h
+; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v1.l, v1.l, v3.l, v3.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.h, v1.h, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
@@ -729,23 +729,23 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_min3_u16 v1, v1, v5, v3
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v6
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT: v_min3_u16 v1, v1, v3, v7
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v6
; GFX12-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v2, v2, v3
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v0, v0, v3
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_min3_u16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT: v_min3_u16 v0, v0, v2, v4
; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v0, v0, v1
@@ -787,32 +787,32 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v16i8:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX7-SDAG-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX7-SDAG-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX7-SDAG-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-SDAG-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX7-SDAG-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX7-SDAG-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-SDAG-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX7-SDAG-NEXT: v_min_u32_e32 v7, v7, v15
-; GFX7-SDAG-NEXT: v_min_u32_e32 v6, v6, v14
-; GFX7-SDAG-NEXT: v_min_u32_e32 v4, v4, v12
-; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v8
-; GFX7-SDAG-NEXT: v_min_u32_e32 v5, v5, v13
+; GFX7-SDAG-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX7-SDAG-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX7-SDAG-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX7-SDAG-NEXT: v_min_u32_e32 v1, v1, v9
-; GFX7-SDAG-NEXT: v_min3_u32 v2, v2, v10, v6
-; GFX7-SDAG-NEXT: v_min3_u32 v3, v3, v11, v7
-; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v8
+; GFX7-SDAG-NEXT: v_min_u32_e32 v6, v6, v14
+; GFX7-SDAG-NEXT: v_min_u32_e32 v2, v2, v10
+; GFX7-SDAG-NEXT: v_min_u32_e32 v7, v7, v15
+; GFX7-SDAG-NEXT: v_min_u32_e32 v3, v3, v11
+; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v4, v12
+; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v5, v13
+; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -899,20 +899,20 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
; GFX9-SDAG-LABEL: test_vector_reduce_umin_v16i8:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX9-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX9-SDAG-NEXT: v_min_u16_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX9-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX9-SDAG-NEXT: v_min_u16_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX9-SDAG-NEXT: v_min_u16_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_min_u16_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_min3_u16 v3, v3, v11, v7
-; GFX9-SDAG-NEXT: v_min_u16_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX9-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX9-SDAG-NEXT: v_min_u16_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_min_u16_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-SDAG-NEXT: v_min_u16_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-SDAG-NEXT: v_min3_u16 v2, v2, v10, v6
-; GFX9-SDAG-NEXT: v_min3_u16 v1, v1, v5, v3
-; GFX9-SDAG-NEXT: v_min3_u16 v0, v0, v4, v2
+; GFX9-SDAG-NEXT: v_min3_u16 v1, v1, v5, v13
+; GFX9-SDAG-NEXT: v_min_u16_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_min_u16_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-SDAG-NEXT: v_min3_u16 v0, v0, v4, v12
+; GFX9-SDAG-NEXT: v_min3_u16 v1, v1, v3, v7
+; GFX9-SDAG-NEXT: v_min3_u16 v0, v0, v2, v6
; GFX9-SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 8
@@ -944,32 +944,32 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
; GFX10-SDAG-LABEL: test_vector_reduce_umin_v16i8:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX10-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX10-SDAG-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX10-SDAG-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX10-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX10-SDAG-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-SDAG-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-SDAG-NEXT: v_min_u16 v7, v7, v15
-; GFX10-SDAG-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX10-SDAG-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX10-SDAG-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX10-SDAG-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX10-SDAG-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX10-SDAG-NEXT: v_min_u16 v1, v1, v9
; GFX10-SDAG-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX10-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX10-SDAG-NEXT: v_min_u16 v5, v5, v13
-; GFX10-SDAG-NEXT: v_min_u16 v1, v1, v9
-; GFX10-SDAG-NEXT: v_min_u16 v6, v6, v14
-; GFX10-SDAG-NEXT: v_min3_u16 v3, v3, v11, v7
-; GFX10-SDAG-NEXT: v_min_u16 v4, v4, v12
+; GFX10-SDAG-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-SDAG-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX10-SDAG-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX10-SDAG-NEXT: v_min_u16 v7, v7, v15
+; GFX10-SDAG-NEXT: v_min_u16 v3, v3, v11
; GFX10-SDAG-NEXT: v_min_u16 v0, v0, v8
-; GFX10-SDAG-NEXT: v_min3_u16 v2, v2, v10, v6
-; GFX10-SDAG-NEXT: v_min3_u16 v1, v1, v5, v3
-; GFX10-SDAG-NEXT: v_min3_u16 v0, v0, v4, v2
+; GFX10-SDAG-NEXT: v_min3_u16 v1, v1, v5, v13
+; GFX10-SDAG-NEXT: v_min_u16 v5, v6, v14
+; GFX10-SDAG-NEXT: v_min_u16 v2, v2, v10
+; GFX10-SDAG-NEXT: v_min3_u16 v0, v0, v4, v12
+; GFX10-SDAG-NEXT: v_min3_u16 v1, v1, v3, v7
+; GFX10-SDAG-NEXT: v_min3_u16 v0, v0, v2, v5
; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 8
@@ -1018,34 +1018,34 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v16i8:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v15.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v14.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v9.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v13.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v13.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.h, v0.h, v10.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v8.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v11.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v15.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.h, v0.h, v9.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v12.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v4.h, v5.l, v4.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v1.h, v6.l, v6.h
-; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v3.l, v3.h, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v3.l, v4.l, v5.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v3.h, v6.h, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v2.h, v3.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v0.h, v5.l, v5.h
+; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v6.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v1.h, v2.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v1.h, v2.l, v2.h, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v1.l, v4.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v4.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v0.h, v2.h, v3.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v3.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
@@ -1061,34 +1061,34 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v16i8:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v7, v7, v15
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v1, v1, v9
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v5, v5, v13
-; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v1, v1, v9
-; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v6, v6, v14
-; GFX11-SDAG-FAKE16-NEXT: v_min3_u16 v3, v3, v11, v7
-; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v4, v4, v12
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v3, v3, v11
; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT: v_min3_u16 v1, v1, v5, v13
+; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v5, v6, v14
+; GFX11-SDAG-FAKE16-NEXT: v_min_u16 v2, v2, v10
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-FAKE16-NEXT: v_min3_u16 v2, v2, v10, v6
-; GFX11-SDAG-FAKE16-NEXT: v_min3_u16 v1, v1, v5, v3
+; GFX11-SDAG-FAKE16-NEXT: v_min3_u16 v0, v0, v4, v12
+; GFX11-SDAG-FAKE16-NEXT: v_min3_u16 v1, v1, v3, v7
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min3_u16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT: v_min3_u16 v0, v0, v2, v5
; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1147,34 +1147,34 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v15.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v14.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v9.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v13.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v13.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.h, v0.h, v10.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v8.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v11.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v15.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.h, v0.h, v9.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v12.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v4.h, v5.l, v4.h
-; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h
-; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v1.h, v6.l, v6.h
-; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v3.l, v3.h, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v3.l, v4.l, v5.h
-; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v3.h, v6.h, v3.h
+; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v2.h, v3.l, v2.h
+; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v0.h, v5.l, v5.h
+; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v6.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v1.h, v2.l, v1.h
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v1.h, v2.l, v2.h, v1.h
-; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v1.l, v4.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v4.l, v4.h
+; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v0.h, v2.h, v3.h
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v3.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
@@ -1194,34 +1194,34 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v7, v7, v15
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v1, v1, v9
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v5, v5, v13
-; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v1, v1, v9
-; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v6, v6, v14
-; GFX12-SDAG-FAKE16-NEXT: v_min3_u16 v3, v3, v11, v7
-; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v4, v4, v12
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v3, v3, v11
; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT: v_min3_u16 v1, v1, v5, v13
+; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v5, v6, v14
+; GFX12-SDAG-FAKE16-NEXT: v_min_u16 v2, v2, v10
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-FAKE16-NEXT: v_min3_u16 v2, v2, v10, v6
-; GFX12-SDAG-FAKE16-NEXT: v_min3_u16 v1, v1, v5, v3
+; GFX12-SDAG-FAKE16-NEXT: v_min3_u16 v0, v0, v4, v12
+; GFX12-SDAG-FAKE16-NEXT: v_min3_u16 v1, v1, v3, v7
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-FAKE16-NEXT: v_min3_u16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT: v_min3_u16 v0, v0, v2, v5
; GFX12-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1685,18 +1685,18 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v8i16:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX7-SDAG-NEXT: v_min_u32_e32 v2, v2, v6
-; GFX7-SDAG-NEXT: v_min_u32_e32 v3, v3, v7
-; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v4
+; GFX7-SDAG-NEXT: v_min_u32_e32 v1, v1, v5
+; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1878,32 +1878,32 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v16i16:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX7-SDAG-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX7-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX7-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-SDAG-NEXT: v_and_b32_e32 v13, 0xffff, v13
; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX7-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-SDAG-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-SDAG-NEXT: v_min_u32_e32 v7, v7, v15
-; GFX7-SDAG-NEXT: v_min_u32_e32 v6, v6, v14
-; GFX7-SDAG-NEXT: v_min_u32_e32 v4, v4, v12
-; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v8
-; GFX7-SDAG-NEXT: v_min_u32_e32 v5, v5, v13
+; GFX7-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-SDAG-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX7-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX7-SDAG-NEXT: v_min_u32_e32 v1, v1, v9
-; GFX7-SDAG-NEXT: v_min3_u32 v2, v2, v10, v6
-; GFX7-SDAG-NEXT: v_min3_u32 v3, v3, v11, v7
-; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v5, v3
-; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v4, v2
+; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v8
+; GFX7-SDAG-NEXT: v_min_u32_e32 v6, v6, v14
+; GFX7-SDAG-NEXT: v_min_u32_e32 v2, v2, v10
+; GFX7-SDAG-NEXT: v_min_u32_e32 v7, v7, v15
+; GFX7-SDAG-NEXT: v_min_u32_e32 v3, v3, v11
+; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v4, v12
+; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v5, v13
+; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v3, v7
+; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v2, v6
; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/X86/combine-add-ssat.ll b/llvm/test/CodeGen/X86/combine-add-ssat.ll
index 3e217980d4a77..75adcddc7b2f3 100644
--- a/llvm/test/CodeGen/X86/combine-add-ssat.ll
+++ b/llvm/test/CodeGen/X86/combine-add-ssat.ll
@@ -62,12 +62,12 @@ define <8 x i16> @combine_constfold_v8i16() {
define <8 x i16> @combine_constfold_undef_v8i16() {
; SSE-LABEL: combine_constfold_undef_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,65534,0,65280,32768,0]
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,u,65534,0,65280,32768,0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_constfold_undef_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65535,65535,65534,0,65280,32768,0]
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65535,u,65534,0,65280,32768,0]
; AVX-NEXT: retq
%res = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> <i16 undef, i16 1, i16 undef, i16 65535, i16 -1, i16 -255, i16 -32760, i16 1>, <8 x i16> <i16 1, i16 undef, i16 undef, i16 65535, i16 1, i16 65535, i16 -10, i16 65535>)
ret <8 x i16> %res
diff --git a/llvm/test/CodeGen/X86/combine-add-usat.ll b/llvm/test/CodeGen/X86/combine-add-usat.ll
index 13bc3b26e2894..5b947dd6f2b55 100644
--- a/llvm/test/CodeGen/X86/combine-add-usat.ll
+++ b/llvm/test/CodeGen/X86/combine-add-usat.ll
@@ -62,12 +62,13 @@ define <8 x i16> @combine_constfold_v8i16() {
define <8 x i16> @combine_constfold_undef_v8i16() {
; SSE-LABEL: combine_constfold_undef_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,2,65535]
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,u,65535,65535,65535,2,65535]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_constfold_undef_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,2,65535]
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [65535,65535,2,65535,65535,65535,2,65535]
+; AVX-NEXT: # xmm0 = mem[0,0]
; AVX-NEXT: retq
%res = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> <i16 undef, i16 1, i16 undef, i16 65535, i16 -1, i16 -255, i16 -65535, i16 1>, <8 x i16> <i16 1, i16 undef, i16 undef, i16 65535, i16 1, i16 65535, i16 1, i16 65535>)
ret <8 x i16> %res
diff --git a/llvm/test/CodeGen/X86/combine-sub-ssat.ll b/llvm/test/CodeGen/X86/combine-sub-ssat.ll
index 979331faf19ec..0dab025c19cc1 100644
--- a/llvm/test/CodeGen/X86/combine-sub-ssat.ll
+++ b/llvm/test/CodeGen/X86/combine-sub-ssat.ll
@@ -62,12 +62,12 @@ define <8 x i16> @combine_constfold_v8i16() {
define <8 x i16> @combine_constfold_undef_v8i16() {
; SSE-LABEL: combine_constfold_undef_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,0,0,65534,65282,32786,2]
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,u,0,65534,65282,32786,2]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_constfold_undef_v8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,65282,32786,2]
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,u,0,65534,65282,32786,2]
; AVX-NEXT: retq
%res = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> <i16 undef, i16 1, i16 undef, i16 65535, i16 -1, i16 -255, i16 -32760, i16 1>, <8 x i16> <i16 1, i16 undef, i16 undef, i16 65535, i16 1, i16 65535, i16 -10, i16 65535>)
ret <8 x i16> %res
diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll
index b70e3fcd779c5..36e374bd2e67c 100644
--- a/llvm/test/CodeGen/X86/combine-sub-usat.ll
+++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll
@@ -73,17 +73,17 @@ define <8 x i16> @combine_constfold_v8i16() {
define <8 x i16> @combine_constfold_undef_v8i16() {
; SSE-LABEL: combine_constfold_undef_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,u,0,65534,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_constfold_undef_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,u,0,65534,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_constfold_undef_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,u,0,65534,0,0,0]
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_constfold_undef_v8i16:
diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll
index b5f3e78991881..f21c07599d6f1 100644
--- a/llvm/test/CodeGen/X86/load-combine.ll
+++ b/llvm/test/CodeGen/X86/load-combine.ll
@@ -800,13 +800,13 @@ define void @shift_i32_by_32(ptr %src1, ptr %src2, ptr %dst) {
; CHECK-LABEL: shift_i32_by_32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl $-1, 4(%eax)
-; CHECK-NEXT: movl $-1, (%eax)
+; CHECK-NEXT: movl $0, 4(%eax)
+; CHECK-NEXT: movl $0, (%eax)
; CHECK-NEXT: retl
;
; CHECK64-LABEL: shift_i32_by_32:
; CHECK64: # %bb.0: # %entry
-; CHECK64-NEXT: movq $-1, (%rdx)
+; CHECK64-NEXT: movq $0, (%rdx)
; CHECK64-NEXT: retq
entry:
%load1 = load i8, ptr %src1, align 1
diff --git a/llvm/test/CodeGen/X86/pr33960.ll b/llvm/test/CodeGen/X86/pr33960.ll
index 44fe777e6d140..6ee270e406892 100644
--- a/llvm/test/CodeGen/X86/pr33960.ll
+++ b/llvm/test/CodeGen/X86/pr33960.ll
@@ -7,12 +7,10 @@
define void @PR33960() {
; X86-LABEL: PR33960:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl $-1, b
; X86-NEXT: retl
;
; X64-LABEL: PR33960:
; X64: # %bb.0: # %entry
-; X64-NEXT: movl $-1, b(%rip)
; X64-NEXT: retq
entry:
%tmp = insertelement <4 x i32> <i32 undef, i32 -7, i32 -3, i32 undef>, i32 -2, i32 3
diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt
index d19b122676c9b..22dbdaa4fa82e 100644
--- a/llvm/unittests/CodeGen/CMakeLists.txt
+++ b/llvm/unittests/CodeGen/CMakeLists.txt
@@ -42,6 +42,7 @@ add_llvm_unittest(CodeGenTests
ScalableVectorMVTsTest.cpp
SchedBoundary.cpp
SelectionDAGAddressAnalysisTest.cpp
+ SelectionDAGNodeConstructionTest.cpp
SelectionDAGPatternMatchTest.cpp
TypeTraitsTest.cpp
TargetOptionsTest.cpp
diff --git a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
index 0058daf680816..7ad7a51091ceb 100644
--- a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
@@ -7,103 +7,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
+#include "SelectionDAGTestBase.h"
#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/AsmParser/Parser.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/IR/Module.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Target/TargetMachine.h"
-#include "gtest/gtest.h"
namespace llvm {
-class SelectionDAGAddressAnalysisTest : public testing::Test {
-protected:
- static void SetUpTestCase() {
- InitializeAllTargets();
- InitializeAllTargetMCs();
- }
-
- void SetUp() override {
- StringRef Assembly = "@g = global i32 0\n"
- "@g_alias = alias i32, i32* @g\n"
- "define i32 @f() {\n"
- " %1 = load i32, i32* @g\n"
- " ret i32 %1\n"
- "}";
-
- Triple TargetTriple("aarch64--");
- std::string Error;
- const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
- // FIXME: These tests do not depend on AArch64 specifically, but we have to
- // initialize a target. A skeleton Target for unittests would allow us to
- // always run these tests.
- if (!T)
- GTEST_SKIP();
-
- TargetOptions Options;
- TM = std::unique_ptr<TargetMachine>(
- T->createTargetMachine(TargetTriple, "", "+sve", Options, std::nullopt,
- std::nullopt, CodeGenOptLevel::Aggressive));
- if (!TM)
- GTEST_SKIP();
-
- SMDiagnostic SMError;
- M = parseAssemblyString(Assembly, SMError, Context);
- if (!M)
- report_fatal_error(SMError.getMessage());
- M->setDataLayout(TM->createDataLayout());
-
- F = M->getFunction("f");
- if (!F)
- report_fatal_error("F?");
- G = M->getGlobalVariable("g");
- if (!G)
- report_fatal_error("G?");
- AliasedG = M->getNamedAlias("g_alias");
- if (!AliasedG)
- report_fatal_error("AliasedG?");
-
- MachineModuleInfo MMI(TM.get());
-
- MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
- MMI.getContext(), 0);
-
- DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
- if (!DAG)
- report_fatal_error("DAG?");
- OptimizationRemarkEmitter ORE(F);
- FunctionAnalysisManager FAM;
- FAM.registerPass([&] { return TM->getTargetIRAnalysis(); });
-
- TargetTransformInfo TTI = TM->getTargetIRAnalysis().run(*F, FAM);
- DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
- nullptr, TTI.hasBranchDivergence(F));
- }
-
- TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {
- return DAG->getTargetLoweringInfo().getTypeAction(Context, VT);
- }
-
- EVT getTypeToTransformTo(EVT VT) {
- return DAG->getTargetLoweringInfo().getTypeToTransformTo(Context, VT);
- }
-
- LLVMContext Context;
- std::unique_ptr<TargetMachine> TM;
- std::unique_ptr<Module> M;
- Function *F;
- GlobalVariable *G;
- GlobalAlias *AliasedG;
- std::unique_ptr<MachineFunction> MF;
- std::unique_ptr<SelectionDAG> DAG;
-};
+class SelectionDAGAddressAnalysisTest : public SelectionDAGTestBase {};
TEST_F(SelectionDAGAddressAnalysisTest, sameFrameObject) {
SDLoc Loc;
diff --git a/llvm/unittests/CodeGen/SelectionDAGNodeConstructionTest.cpp b/llvm/unittests/CodeGen/SelectionDAGNodeConstructionTest.cpp
new file mode 100644
index 0000000000000..b2c1420215c3f
--- /dev/null
+++ b/llvm/unittests/CodeGen/SelectionDAGNodeConstructionTest.cpp
@@ -0,0 +1,317 @@
+//===---- llvm/unittest/CodeGen/SelectionDAGNodeConstructionTest.cpp ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SelectionDAGTestBase.h"
+
+using namespace llvm;
+
+class SelectionDAGNodeConstructionTest : public SelectionDAGTestBase {};
+
+TEST_F(SelectionDAGNodeConstructionTest, ADD) {
+ SDLoc DL;
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::ADD, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::ADD, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::ADD, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::ADD, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::ADD, DL, MVT::i32, Op, Undef), Undef);
+ EXPECT_EQ(DAG->getNode(ISD::ADD, DL, MVT::i32, Undef, Op), Undef);
+ EXPECT_EQ(DAG->getNode(ISD::ADD, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, AND) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue Zero = DAG->getConstant(0, DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::AND, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::AND, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::AND, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::AND, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::AND, DL, MVT::i32, Op, Undef), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::AND, DL, MVT::i32, Undef, Op), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::AND, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, MUL) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue Zero = DAG->getConstant(0, DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::MUL, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::MUL, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::MUL, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::MUL, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::MUL, DL, MVT::i32, Op, Undef), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::MUL, DL, MVT::i32, Undef, Op), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::MUL, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, OR) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue AllOnes = DAG->getAllOnesConstant(DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::OR, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::OR, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::OR, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::OR, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::OR, DL, MVT::i32, Op, Undef), AllOnes);
+ EXPECT_EQ(DAG->getNode(ISD::OR, DL, MVT::i32, Undef, Op), AllOnes);
+ EXPECT_EQ(DAG->getNode(ISD::OR, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, SADDSAT) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue AllOnes = DAG->getAllOnesConstant(DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::SADDSAT, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SADDSAT, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SADDSAT, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SADDSAT, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::SADDSAT, DL, MVT::i32, Op, Undef), AllOnes);
+ EXPECT_EQ(DAG->getNode(ISD::SADDSAT, DL, MVT::i32, Undef, Op), AllOnes);
+ EXPECT_EQ(DAG->getNode(ISD::SADDSAT, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, SDIV) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue Zero = DAG->getConstant(0, DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::SDIV, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SDIV, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SDIV, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SDIV, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::SDIV, DL, MVT::i32, Op, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SDIV, DL, MVT::i32, Undef, Op), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::SDIV, DL, MVT::i32, Undef, Undef), Poison);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, SMAX) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue MaxInt = DAG->getConstant(APInt::getSignedMaxValue(32), DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::SMAX, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SMAX, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SMAX, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SMAX, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::SMAX, DL, MVT::i32, Op, Undef), MaxInt);
+ EXPECT_EQ(DAG->getNode(ISD::SMAX, DL, MVT::i32, Undef, Op), MaxInt);
+ EXPECT_EQ(DAG->getNode(ISD::SMAX, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, SMIN) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue MinInt = DAG->getConstant(APInt::getSignedMinValue(32), DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::SMIN, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SMIN, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SMIN, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SMIN, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::SMIN, DL, MVT::i32, Op, Undef), MinInt);
+ EXPECT_EQ(DAG->getNode(ISD::SMIN, DL, MVT::i32, Undef, Op), MinInt);
+ EXPECT_EQ(DAG->getNode(ISD::SMIN, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, SREM) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue Zero = DAG->getConstant(0, DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::SREM, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SREM, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SREM, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SREM, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::SREM, DL, MVT::i32, Op, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SREM, DL, MVT::i32, Undef, Op), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::SREM, DL, MVT::i32, Undef, Undef), Poison);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, SSUBSAT) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue Zero = DAG->getConstant(0, DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::SSUBSAT, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SSUBSAT, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SSUBSAT, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SSUBSAT, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::SSUBSAT, DL, MVT::i32, Op, Undef), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::SSUBSAT, DL, MVT::i32, Undef, Op), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::SSUBSAT, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, SUB) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::SUB, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SUB, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SUB, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::SUB, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::SUB, DL, MVT::i32, Op, Undef), Undef);
+ EXPECT_EQ(DAG->getNode(ISD::SUB, DL, MVT::i32, Undef, Op), Undef);
+ EXPECT_EQ(DAG->getNode(ISD::SUB, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, UADDSAT) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue AllOnes = DAG->getAllOnesConstant(DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::UADDSAT, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UADDSAT, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UADDSAT, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UADDSAT, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::UADDSAT, DL, MVT::i32, Op, Undef), AllOnes);
+ EXPECT_EQ(DAG->getNode(ISD::UADDSAT, DL, MVT::i32, Undef, Op), AllOnes);
+ EXPECT_EQ(DAG->getNode(ISD::UADDSAT, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, UDIV) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue Zero = DAG->getConstant(0, DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::UDIV, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UDIV, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UDIV, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UDIV, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::UDIV, DL, MVT::i32, Op, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UDIV, DL, MVT::i32, Undef, Op), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::UDIV, DL, MVT::i32, Undef, Undef), Poison);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, UMAX) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue AllOnes = DAG->getAllOnesConstant(DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::UMAX, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UMAX, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UMAX, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UMAX, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::UMAX, DL, MVT::i32, Op, Undef), AllOnes);
+ EXPECT_EQ(DAG->getNode(ISD::UMAX, DL, MVT::i32, Undef, Op), AllOnes);
+ EXPECT_EQ(DAG->getNode(ISD::UMAX, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, UMIN) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue Zero = DAG->getConstant(0, DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::UMIN, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UMIN, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UMIN, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UMIN, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::UMIN, DL, MVT::i32, Op, Undef), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::UMIN, DL, MVT::i32, Undef, Op), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::UMIN, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, UREM) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue Zero = DAG->getConstant(0, DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::UREM, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UREM, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UREM, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UREM, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::UREM, DL, MVT::i32, Op, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::UREM, DL, MVT::i32, Undef, Op), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::UREM, DL, MVT::i32, Undef, Undef), Poison);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, USUBSAT) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue Zero = DAG->getConstant(0, DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::USUBSAT, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::USUBSAT, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::USUBSAT, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::USUBSAT, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::USUBSAT, DL, MVT::i32, Op, Undef), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::USUBSAT, DL, MVT::i32, Undef, Op), Zero);
+ EXPECT_EQ(DAG->getNode(ISD::USUBSAT, DL, MVT::i32, Undef, Undef), Undef);
+}
+
+TEST_F(SelectionDAGNodeConstructionTest, XOR) {
+ SDLoc DL;
+ SDValue Op = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, MVT::i32);
+ SDValue Poison = DAG->getPOISON(MVT::i32);
+ SDValue Undef = DAG->getUNDEF(MVT::i32);
+ SDValue Zero = DAG->getConstant(0, DL, MVT::i32);
+
+ EXPECT_EQ(DAG->getNode(ISD::XOR, DL, MVT::i32, Op, Poison), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::XOR, DL, MVT::i32, Poison, Op), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::XOR, DL, MVT::i32, Poison, Undef), Poison);
+ EXPECT_EQ(DAG->getNode(ISD::XOR, DL, MVT::i32, Undef, Poison), Poison);
+
+ EXPECT_EQ(DAG->getNode(ISD::XOR, DL, MVT::i32, Op, Undef), Undef);
+ EXPECT_EQ(DAG->getNode(ISD::XOR, DL, MVT::i32, Undef, Op), Undef);
+ EXPECT_EQ(DAG->getNode(ISD::XOR, DL, MVT::i32, Undef, Undef), Zero);
+}
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 30a1406da7734..4e0bf385d72b2 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -6,102 +6,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/AsmParser/Parser.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "SelectionDAGTestBase.h"
#include "llvm/CodeGen/SDPatternMatch.h"
-#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/IR/Module.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Target/TargetMachine.h"
-#include "gtest/gtest.h"
using namespace llvm;
-class SelectionDAGPatternMatchTest : public testing::Test {
-protected:
- static void SetUpTestCase() {
- InitializeAllTargets();
- InitializeAllTargetMCs();
- }
-
- void SetUp() override {
- StringRef Assembly = "@g = global i32 0\n"
- "@g_alias = alias i32, i32* @g\n"
- "define i32 @f() {\n"
- " %1 = load i32, i32* @g\n"
- " ret i32 %1\n"
- "}";
-
- Triple TargetTriple("riscv64--");
- std::string Error;
- const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
- // FIXME: These tests do not depend on RISCV specifically, but we have to
- // initialize a target. A skeleton Target for unittests would allow us to
- // always run these tests.
- if (!T)
- GTEST_SKIP();
-
- TargetOptions Options;
- TM = std::unique_ptr<TargetMachine>(T->createTargetMachine(
- TargetTriple, "", "+m,+f,+d,+v", Options, std::nullopt, std::nullopt,
- CodeGenOptLevel::Aggressive));
- if (!TM)
- GTEST_SKIP();
-
- SMDiagnostic SMError;
- M = parseAssemblyString(Assembly, SMError, Context);
- if (!M)
- report_fatal_error(SMError.getMessage());
- M->setDataLayout(TM->createDataLayout());
-
- F = M->getFunction("f");
- if (!F)
- report_fatal_error("F?");
- G = M->getGlobalVariable("g");
- if (!G)
- report_fatal_error("G?");
- AliasedG = M->getNamedAlias("g_alias");
- if (!AliasedG)
- report_fatal_error("AliasedG?");
-
- MachineModuleInfo MMI(TM.get());
-
- MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
- MMI.getContext(), 0);
-
- DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
- if (!DAG)
- report_fatal_error("DAG?");
- OptimizationRemarkEmitter ORE(F);
- FunctionAnalysisManager FAM;
- FAM.registerPass([&] { return TM->getTargetIRAnalysis(); });
-
- TargetTransformInfo TTI = TM->getTargetIRAnalysis().run(*F, FAM);
- DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
- nullptr, TTI.hasBranchDivergence(F));
- }
-
- TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {
- return DAG->getTargetLoweringInfo().getTypeAction(Context, VT);
- }
-
- EVT getTypeToTransformTo(EVT VT) {
- return DAG->getTargetLoweringInfo().getTypeToTransformTo(Context, VT);
- }
-
- LLVMContext Context;
- std::unique_ptr<TargetMachine> TM;
- std::unique_ptr<Module> M;
- Function *F;
- GlobalVariable *G;
- GlobalAlias *AliasedG;
- std::unique_ptr<MachineFunction> MF;
- std::unique_ptr<SelectionDAG> DAG;
-};
+class SelectionDAGPatternMatchTest : public SelectionDAGTestBase {};
TEST_F(SelectionDAGPatternMatchTest, matchValueType) {
SDLoc DL;
diff --git a/llvm/unittests/CodeGen/SelectionDAGTestBase.h b/llvm/unittests/CodeGen/SelectionDAGTestBase.h
new file mode 100644
index 0000000000000..edc730d7f9b45
--- /dev/null
+++ b/llvm/unittests/CodeGen/SelectionDAGTestBase.h
@@ -0,0 +1,99 @@
+//===---- llvm/unittest/CodeGen/SelectionDAGTestBase.h --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+class SelectionDAGTestBase : public testing::Test {
+protected:
+ static void SetUpTestCase() {
+ InitializeAllTargets();
+ InitializeAllTargetMCs();
+ }
+
+ void SetUp() override {
+ StringRef Assembly = "@g = global i32 0\n"
+ "@g_alias = alias i32, i32* @g\n"
+ "define i32 @f() {\n"
+ " %1 = load i32, i32* @g\n"
+ " ret i32 %1\n"
+ "}";
+
+ Triple TargetTriple("aarch64--");
+ std::string Error;
+ const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
+ // FIXME: These tests do not depend on AArch64 specifically, but we have to
+ // initialize a target. A skeleton Target for unittests would allow us to
+ // always run these tests.
+ if (!T)
+ GTEST_SKIP();
+
+ TargetOptions Options;
+ TM = std::unique_ptr<TargetMachine>(
+ T->createTargetMachine(TargetTriple, "", "+sve", Options, std::nullopt,
+ std::nullopt, CodeGenOptLevel::Aggressive));
+ if (!TM)
+ GTEST_SKIP();
+
+ SMDiagnostic SMError;
+ M = parseAssemblyString(Assembly, SMError, Context);
+ ASSERT_TRUE(M && "Could not parse module!");
+ M->setDataLayout(TM->createDataLayout());
+
+ F = M->getFunction("f");
+ ASSERT_TRUE(F && "Could not get function f!");
+ G = M->getGlobalVariable("g");
+ ASSERT_TRUE(G && "Could not get global g!");
+ AliasedG = M->getNamedAlias("g_alias");
+ ASSERT_TRUE(AliasedG && "Could not get alias g_alias!");
+
+ MachineModuleInfo MMI(TM.get());
+
+ MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
+ MMI.getContext(), 0);
+
+ DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
+ if (!DAG)
+ reportFatalUsageError("Failed to create SelectionDAG?");
+ OptimizationRemarkEmitter ORE(F);
+ FunctionAnalysisManager FAM;
+ FAM.registerPass([&] { return TM->getTargetIRAnalysis(); });
+
+ TargetTransformInfo TTI = TM->getTargetIRAnalysis().run(*F, FAM);
+ DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
+ nullptr, TTI.hasBranchDivergence(F));
+ }
+
+ TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {
+ return DAG->getTargetLoweringInfo().getTypeAction(Context, VT);
+ }
+
+ EVT getTypeToTransformTo(EVT VT) {
+ return DAG->getTargetLoweringInfo().getTypeToTransformTo(Context, VT);
+ }
+
+ LLVMContext Context;
+ std::unique_ptr<TargetMachine> TM;
+ std::unique_ptr<Module> M;
+ Function *F;
+ GlobalVariable *G;
+ GlobalAlias *AliasedG;
+ std::unique_ptr<MachineFunction> MF;
+ std::unique_ptr<SelectionDAG> DAG;
+};