[llvm] d2c5fbe - [AMDGPU] Legalize vector fminimum and fmaximum with VOP3P (#138971)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 8 22:31:31 PDT 2025
Author: Stanislav Mekhanoshin
Date: 2025-05-08T22:31:27-07:00
New Revision: d2c5fbe9ea14bdcd0008691b76a562ff69f04b99
URL: https://github.com/llvm/llvm-project/commit/d2c5fbe9ea14bdcd0008691b76a562ff69f04b99
DIFF: https://github.com/llvm/llvm-project/commit/d2c5fbe9ea14bdcd0008691b76a562ff69f04b99.diff
LOG: [AMDGPU] Legalize vector fminimum and fmaximum with VOP3P (#138971)
Co-authored-by: Matt Arsenault <Matthew.Arsenault at amd.com>
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/Analysis/CostModel/AMDGPU/maximum.ll
llvm/test/Analysis/CostModel/AMDGPU/minimum.ll
llvm/test/CodeGen/AMDGPU/fmaximum3.ll
llvm/test/CodeGen/AMDGPU/fminimum3.ll
llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 76c4546f1207e..95a8558c3f5a4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -861,9 +861,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasIEEEMinMax()) {
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
{MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
- setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
- {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
- Custom);
} else {
// FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
if (Subtarget->hasMinimum3Maximum3F32())
@@ -878,6 +875,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
}
+ if (Subtarget->hasVOP3PInsts()) {
+ // We want to break these into v2f16 pieces, not scalarize.
+ setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+ Custom);
+ }
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll
index 603e04fc7a7a7..3774c6c0cbbee 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll
@@ -11,19 +11,19 @@ define void @maximum_f16() {
; GFX950-FASTF64-LABEL: 'maximum_f16'
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
-; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
-; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
-; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
-; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX9-LABEL: 'maximum_f16'
; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
; GFX9-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
-; GFX9-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
-; GFX9-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
-; GFX9-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
-; GFX9-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; SLOWF64-LABEL: 'maximum_f16'
@@ -38,10 +38,10 @@ define void @maximum_f16() {
; GFX9-SIZE-LABEL: 'maximum_f16'
; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
-; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
-; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
-; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
-; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLOW-SIZE-LABEL: 'maximum_f16'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll
index 4507ba4929f1b..24b9549dfe3a4 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll
@@ -11,19 +11,19 @@ define void @minimum_f16() {
; GFX950-FASTF64-LABEL: 'minimum_f16'
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
-; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
-; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
-; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
-; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
; GFX950-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX9-LABEL: 'minimum_f16'
; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
; GFX9-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
-; GFX9-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
-; GFX9-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
-; GFX9-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
-; GFX9-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; SLOWF64-LABEL: 'minimum_f16'
@@ -38,10 +38,10 @@ define void @minimum_f16() {
; GFX9-SIZE-LABEL: 'minimum_f16'
; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
-; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
-; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
-; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
-; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLOW-SIZE-LABEL: 'minimum_f16'
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 567202be69fa6..53d940e1e6c1a 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -2375,21 +2375,21 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_max_f16 v2, v1, v3
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT: v_pk_max_f16 v6, v1, v3
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
+; GFX942-NEXT: v_pk_max_f16 v2, v4, v2
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX942-NEXT: v_perm_b32 v1, v1, v9, s0
; GFX942-NEXT: v_pk_max_f16 v1, v5, v1
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6
-; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
-; GFX942-NEXT: v_pk_max_f16 v2, v4, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v9
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8
; GFX942-NEXT: s_nop 1
@@ -2437,21 +2437,21 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_max_f16 v2, v1, v3
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT: v_pk_max_f16 v6, v1, v3
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX942-NEXT: v_perm_b32 v1, v1, v9, s0
; GFX942-NEXT: v_pk_max_f16 v1, v1, v5
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5
-; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v9, v5
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4
; GFX942-NEXT: s_nop 1
@@ -2500,40 +2500,40 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1
; GFX942-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3
+; GFX942-NEXT: v_pk_max_f16 v7, v7, v9
; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0
; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2
-; GFX942-NEXT: v_pk_max_f16 v7, v7, v9
-; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00
; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: v_pk_max_f16 v6, v6, v8
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6
-; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4
; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5
-; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
-; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
-; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT: v_perm_b32 v3, v9, v1, s0
+; GFX942-NEXT: v_pk_max_f16 v3, v3, v10
+; GFX942-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc
-; GFX942-NEXT: v_perm_b32 v2, v8, v0, s0
+; GFX942-NEXT: v_perm_b32 v2, v7, v0, s0
; GFX942-NEXT: v_pk_max_f16 v2, v2, v11
-; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX942-NEXT: v_perm_b32 v6, v9, v1, s0
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX942-NEXT: v_pk_max_f16 v6, v6, v10
+; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v7, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5|
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v3, vcc
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4|
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc
-; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0
+; GFX942-NEXT: v_perm_b32 v0, v6, v0, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_fmaximum3_v3f16__fabs_all:
@@ -2582,21 +2582,21 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT: v_pk_max_f16 v6, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX942-NEXT: v_perm_b32 v1, v1, v9, s0
; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
-; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5
-; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v9, -v5
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4
; GFX942-NEXT: s_nop 1
@@ -2643,22 +2643,21 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
-; GFX942-NEXT: s_mov_b32 s1, 0x5040100
-; GFX942-NEXT: s_movk_i32 s0, 0x7e00
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: s_mov_b32 s1, 0x5040100
+; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0
; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX942-NEXT: v_perm_b32 v4, v6, v0, s1
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX942-NEXT: s_movk_i32 s0, 0x7e00
; GFX942-NEXT: v_pk_max_f16 v4, v4, v2
-; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v7, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0
+; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_pk_max_f16 v7, v7, v3
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
@@ -2705,21 +2704,21 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_max_f16 v2, v1, v3
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT: v_pk_max_f16 v4, v1, v3
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0
+; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX942-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX942-NEXT: v_perm_b32 v1, v1, v7, s0
; GFX942-NEXT: v_pk_max_f16 v1, v1, 4.0
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
-; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v7, v7
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6
; GFX942-NEXT: s_nop 1
@@ -2765,34 +2764,34 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_max_f16 v2, v1, v3
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT: v_pk_max_f16 v6, v1, v3
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
+; GFX942-NEXT: v_pk_max_f16 v2, v4, v2
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0
-; GFX942-NEXT: v_pk_max_f16 v2, v5, v2
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_perm_b32 v3, v1, v9, s0
+; GFX942-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v9
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
-; GFX942-NEXT: v_pk_max_f16 v2, v4, v2
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8
-; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0
+; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0
+; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_fmaximum3_v4f16:
@@ -2830,34 +2829,34 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_max_f16 v2, v1, v3
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT: v_pk_max_f16 v6, v1, v3
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v5
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_perm_b32 v3, v1, v9, s0
+; GFX942-NEXT: v_pk_max_f16 v3, v3, v5
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v9, v5
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4
-; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0
+; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0
+; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_fmaximum3_v4f16_commute:
@@ -2898,42 +2897,40 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0
; GFX942-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2
-; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1
-; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3
; GFX942-NEXT: v_pk_max_f16 v7, v7, v9
; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00
; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1
+; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
; GFX942-NEXT: v_pk_max_f16 v6, v6, v8
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
-; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5
; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4
-; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
-; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc
+; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
-; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_perm_b32 v2, v9, v0, s0
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v10
; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc
-; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0
-; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v11
-; GFX942-NEXT: v_perm_b32 v6, v9, v0, s0
-; GFX942-NEXT: v_cndmask_b32_sdwa v3, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_pk_max_f16 v6, v6, v10
+; GFX942-NEXT: v_perm_b32 v3, v7, v1, s0
+; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v7, |v5| src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_pk_max_f16 v3, v3, v11
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_cndmask_b32_sdwa v6, v12, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5|
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v3, vcc
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4|
-; GFX942-NEXT: v_perm_b32 v1, v3, v1, s0
+; GFX942-NEXT: v_perm_b32 v1, v6, v1, s0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc
; GFX942-NEXT: v_perm_b32 v0, v7, v0, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
@@ -2981,34 +2978,34 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT: v_pk_max_f16 v6, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
-; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_perm_b32 v3, v1, v9, s0
+; GFX942-NEXT: v_pk_max_f16 v3, v3, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v9, -v5
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4
-; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0
+; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0
+; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_fmaximum3_v4f16__fneg_all:
@@ -3046,37 +3043,34 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00
-; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0]
; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7
-; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: s_mov_b32 s0, 0x5040100
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0]
+; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT: v_perm_b32 v4, v6, v0, s0
+; GFX942-NEXT: v_pk_max_f16 v4, v4, v2
+; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
-; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0
+; GFX942-NEXT: v_perm_b32 v7, v8, v1, s0
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX942-NEXT: v_pk_max_f16 v7, v7, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v6, v6, v0, s0
-; GFX942-NEXT: v_pk_max_f16 v6, v6, v2
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0
+; GFX942-NEXT: v_perm_b32 v1, v8, v1, s0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; GFX942-NEXT: v_perm_b32 v0, v8, v0, s0
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT: v_perm_b32 v0, v6, v0, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm1:
@@ -3114,34 +3108,34 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_max_f16 v2, v1, v3
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0
+; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX942-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_perm_b32 v3, v1, v7, s0
+; GFX942-NEXT: v_pk_max_f16 v3, v3, 4.0 op_sel_hi:[1,0]
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v7, v7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6
-; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0
+; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm2:
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 81b8e8ebd10e3..d1d0c0dcdb7e0 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -2375,21 +2375,21 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_min_f16 v2, v1, v3
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT: v_pk_min_f16 v6, v1, v3
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
+; GFX942-NEXT: v_pk_min_f16 v2, v4, v2
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX942-NEXT: v_perm_b32 v1, v1, v9, s0
; GFX942-NEXT: v_pk_min_f16 v1, v5, v1
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6
-; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
-; GFX942-NEXT: v_pk_min_f16 v2, v4, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v9
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8
; GFX942-NEXT: s_nop 1
@@ -2437,21 +2437,21 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_min_f16 v2, v1, v3
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT: v_pk_min_f16 v6, v1, v3
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
+; GFX942-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX942-NEXT: v_perm_b32 v1, v1, v9, s0
; GFX942-NEXT: v_pk_min_f16 v1, v1, v5
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5
-; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v9, v5
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4
; GFX942-NEXT: s_nop 1
@@ -2500,40 +2500,40 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1
; GFX942-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3
+; GFX942-NEXT: v_pk_min_f16 v7, v7, v9
; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0
; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2
-; GFX942-NEXT: v_pk_min_f16 v7, v7, v9
-; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00
; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: v_pk_min_f16 v6, v6, v8
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6
-; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4
; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5
-; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
-; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
-; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT: v_perm_b32 v3, v9, v1, s0
+; GFX942-NEXT: v_pk_min_f16 v3, v3, v10
+; GFX942-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc
-; GFX942-NEXT: v_perm_b32 v2, v8, v0, s0
+; GFX942-NEXT: v_perm_b32 v2, v7, v0, s0
; GFX942-NEXT: v_pk_min_f16 v2, v2, v11
-; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX942-NEXT: v_perm_b32 v6, v9, v1, s0
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX942-NEXT: v_pk_min_f16 v6, v6, v10
+; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v7, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5|
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v3, vcc
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4|
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc
-; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0
+; GFX942-NEXT: v_perm_b32 v0, v6, v0, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_fminimum3_v3f16__fabs_all:
@@ -2582,21 +2582,21 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT: v_pk_min_f16 v6, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
+; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX942-NEXT: v_perm_b32 v1, v1, v9, s0
; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
-; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5
-; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v9, -v5
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4
; GFX942-NEXT: s_nop 1
@@ -2643,22 +2643,21 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
-; GFX942-NEXT: s_mov_b32 s1, 0x5040100
-; GFX942-NEXT: s_movk_i32 s0, 0x7e00
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: s_mov_b32 s1, 0x5040100
+; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0
; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX942-NEXT: v_perm_b32 v4, v6, v0, s1
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX942-NEXT: s_movk_i32 s0, 0x7e00
; GFX942-NEXT: v_pk_min_f16 v4, v4, v2
-; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v7, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0
+; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_pk_min_f16 v7, v7, v3
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
@@ -2705,21 +2704,21 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_min_f16 v2, v1, v3
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT: v_pk_min_f16 v4, v1, v3
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0
+; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX942-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX942-NEXT: v_perm_b32 v1, v1, v7, s0
; GFX942-NEXT: v_pk_min_f16 v1, v1, 4.0
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
-; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v7, v7
+; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6
; GFX942-NEXT: s_nop 1
@@ -2765,34 +2764,34 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_min_f16 v2, v1, v3
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT: v_pk_min_f16 v6, v1, v3
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
+; GFX942-NEXT: v_pk_min_f16 v2, v4, v2
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0
-; GFX942-NEXT: v_pk_min_f16 v2, v5, v2
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_perm_b32 v3, v1, v9, s0
+; GFX942-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v9
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
-; GFX942-NEXT: v_pk_min_f16 v2, v4, v2
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8
-; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0
+; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0
+; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_fminimum3_v4f16:
@@ -2830,34 +2829,34 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_min_f16 v2, v1, v3
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT: v_pk_min_f16 v6, v1, v3
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
+; GFX942-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, v5
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_perm_b32 v3, v1, v9, s0
+; GFX942-NEXT: v_pk_min_f16 v3, v3, v5
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v9, v5
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4
-; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0
+; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0
+; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_fminimum3_v4f16_commute:
@@ -2898,42 +2897,40 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0
; GFX942-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2
-; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1
-; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3
; GFX942-NEXT: v_pk_min_f16 v7, v7, v9
; GFX942-NEXT: v_mov_b32_e32 v12, 0x7e00
; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v7
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1
+; GFX942-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
; GFX942-NEXT: v_pk_min_f16 v6, v6, v8
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
-; GFX942-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5
; GFX942-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4
-; GFX942-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
-; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc
+; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
-; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_perm_b32 v2, v9, v0, s0
+; GFX942-NEXT: v_pk_min_f16 v2, v2, v10
; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc
-; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0
-; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_pk_min_f16 v2, v2, v11
-; GFX942-NEXT: v_perm_b32 v6, v9, v0, s0
-; GFX942-NEXT: v_cndmask_b32_sdwa v3, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_pk_min_f16 v6, v6, v10
+; GFX942-NEXT: v_perm_b32 v3, v7, v1, s0
+; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v7, |v5| src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_pk_min_f16 v3, v3, v11
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_cndmask_b32_sdwa v6, v12, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5|
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v3, vcc
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4|
-; GFX942-NEXT: v_perm_b32 v1, v3, v1, s0
+; GFX942-NEXT: v_perm_b32 v1, v6, v1, s0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc
; GFX942-NEXT: v_perm_b32 v0, v7, v0, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
@@ -2981,34 +2978,34 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT: v_pk_min_f16 v6, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
+; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
-; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_perm_b32 v3, v1, v9, s0
+; GFX942-NEXT: v_pk_min_f16 v3, v3, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v9, -v5
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v3, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4
-; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0
+; GFX942-NEXT: v_perm_b32 v1, v1, v6, s0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0
+; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_fminimum3_v4f16__fneg_all:
@@ -3046,37 +3043,34 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00
-; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0]
; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7
-; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: s_mov_b32 s0, 0x5040100
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0]
+; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT: v_perm_b32 v4, v6, v0, s0
+; GFX942-NEXT: v_pk_min_f16 v4, v4, v2
+; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
-; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0
+; GFX942-NEXT: v_perm_b32 v7, v8, v1, s0
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX942-NEXT: v_pk_min_f16 v7, v7, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v6, v6, v0, s0
-; GFX942-NEXT: v_pk_min_f16 v6, v6, v2
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0
+; GFX942-NEXT: v_perm_b32 v1, v8, v1, s0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; GFX942-NEXT: v_perm_b32 v0, v8, v0, s0
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT: v_perm_b32 v0, v6, v0, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_fminimum3_v4f16__inlineimm1:
@@ -3114,34 +3108,34 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT: v_pk_min_f16 v2, v1, v3
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0
+; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX942-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
-; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_perm_b32 v3, v1, v7, s0
+; GFX942-NEXT: v_pk_min_f16 v3, v3, 4.0 op_sel_hi:[1,0]
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v7, v7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6
-; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0
+; GFX942-NEXT: v_perm_b32 v1, v1, v4, s0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_perm_b32 v0, v0, v4, s0
+; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_fminimum3_v4f16__inlineimm2:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
index 389df695ba324..41fad10051dac 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
@@ -431,131 +431,122 @@ define half @test_vector_reduce_fmaximum_v8half(<8 x half> %v) {
; GFX9-LABEL: test_vector_reduce_fmaximum_v8half:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_sdwa v4, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_pk_max_f16 v4, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT: v_max_f16_e32 v4, v0, v1
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_max_f16_e32 v2, v6, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v0
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT: v_max_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_max_f16_e32 v6, v0, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v3
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT: v_max_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v3 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_vector_reduce_fmaximum_v8half:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f16_sdwa v4, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX10-NEXT: v_max_f16_e32 v4, v0, v1
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX10-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX10-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX10-NEXT: v_pk_max_f16 v4, v0, v2
; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_e32 v1, v0, v3
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v3 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT: v_max_f16_e32 v5, v4, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00
+; GFX10-NEXT: v_max_f16_e32 v4, v0, v6
+; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v6
+; GFX10-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4
+; GFX10-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v8half:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.h, v2.h
+; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v4.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.h, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v2.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v3.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v8half:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v5, v0, v4
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v4, v0, v1
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v5
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v5
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v4, v1, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v5, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v4
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v3
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v3, v0, v6
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v6
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v8half:
@@ -738,180 +729,174 @@ define half @test_vector_reduce_fmaximum_v16half(<16 x half> %v) {
; GFX9-LABEL: test_vector_reduce_fmaximum_v16half:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_sdwa v8, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_pk_max_f16 v8, v2, v6
; GFX9-NEXT: v_mov_b32_e32 v9, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v6
+; GFX9-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX9-NEXT: v_max_f16_e32 v8, v0, v1
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc
+; GFX9-NEXT: v_pk_max_f16 v8, v0, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4
+; GFX9-NEXT: v_perm_b32 v6, v2, v10, s0
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX9-NEXT: v_max_f16_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_perm_b32 v4, v0, v11, s0
+; GFX9-NEXT: v_pk_max_f16 v4, v4, v6
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v11, v10
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_max_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v3
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v3
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_max_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v3 src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v4
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_max_f16_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v4, vcc
+; GFX9-NEXT: v_max_f16_e32 v2, v6, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v0
+; GFX9-NEXT: v_pk_max_f16 v6, v1, v5
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v3, v7
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v3, v7
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v5
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_max_f16_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v5
+; GFX9-NEXT: v_perm_b32 v3, v2, v4, s0
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v6
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc
+; GFX9-NEXT: v_perm_b32 v5, v1, v7, s0
+; GFX9-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v7, v4
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_max_f16_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v6 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v5, v0, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v7
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v2
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_max_f16_sdwa v1, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v7 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v9, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_vector_reduce_fmaximum_v16half:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f16_sdwa v8, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo
-; GFX10-NEXT: v_max_f16_e32 v8, v0, v1
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo
-; GFX10-NEXT: v_max_f16_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo
-; GFX10-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_e32 v1, v0, v3
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v3 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_e32 v1, v0, v4
+; GFX10-NEXT: v_pk_max_f16 v8, v2, v6
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6
+; GFX10-NEXT: v_pk_max_f16 v9, v0, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v8, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo
; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_e32 v1, v0, v5
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v5 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_e32 v1, v0, v6
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v6
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v6 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_e32 v1, v0, v7
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v7
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_max_f16_sdwa v1, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v7 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v9, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_pk_max_f16 v9, v3, v7
+; GFX10-NEXT: v_perm_b32 v4, v2, v8, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v9
+; GFX10-NEXT: v_pk_max_f16 v11, v1, v5
+; GFX10-NEXT: v_perm_b32 v10, v0, v6, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v9, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_pk_max_f16 v4, v10, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v12, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v11, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v10, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v8
+; GFX10-NEXT: v_perm_b32 v6, v3, v9, 0x5040100
+; GFX10-NEXT: v_perm_b32 v8, v1, v7, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX10-NEXT: v_pk_max_f16 v2, v8, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v9
+; GFX10-NEXT: v_max_f16_e32 v5, v4, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00
+; GFX10-NEXT: v_max_f16_e32 v4, v0, v6
+; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v6
+; GFX10-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4
+; GFX10-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v16half:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v8, v2, v6
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.h, v6.h
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v2, v0, v4
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v0.h, v4.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v8.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v8.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v2.l, s1
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x7e00, v2.h, s2
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v3.h, v7.h
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v1.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s3, v1.h, v5.h
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v4, v2, v0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v2, v3, v7
+; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v3, v1, v5
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v4.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v2.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v3.l, s2
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x7e00, v3.h, s3
; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v2.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v3.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v4.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v4.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v5.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v5.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_max_f16 v3, v2, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v6.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v6.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.h, v1.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v7.h
-; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v7.h
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -919,74 +904,70 @@ define half @test_vector_reduce_fmaximum_v16half(<16 x half> %v) {
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v16half:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v9, v0, v8
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v9, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v8, v0, v1
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v9
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v9
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v8
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v3
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v4
+; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v8, v2, v6
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v10, v0, v4
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v4
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v8, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v5
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v5
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v6
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v6
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v7
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v7
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v6, v2, v9, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v10, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11
+; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v10, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v12, v1, v5
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v13, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v8, v4, v0, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v6, v8, v6
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v10, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v6
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v10, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v14, v13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v15, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v9
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v6, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v2
+; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v4, v3, v8, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v6, v5, v1, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v11, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_pk_max_f16 v4, v6, v4
+; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v7, v0, v2
; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v8
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v3
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v16half:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
index 2f628b7cdb281..61819a85dd82c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
@@ -509,131 +509,122 @@ define half @test_vector_reduce_fminimum_v8half(<8 x half> %v) {
; GFX9-LABEL: test_vector_reduce_fminimum_v8half:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_sdwa v4, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_pk_min_f16 v4, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT: v_min_f16_e32 v4, v0, v1
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_min_f16_e32 v2, v6, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v0
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT: v_min_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_min_f16_e32 v6, v0, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v3
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT: v_min_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v3 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_vector_reduce_fminimum_v8half:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_min_f16_sdwa v4, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX10-NEXT: v_min_f16_e32 v4, v0, v1
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX10-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX10-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX10-NEXT: v_pk_min_f16 v4, v0, v2
; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_e32 v1, v0, v3
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v3 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT: v_min_f16_e32 v5, v4, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00
+; GFX10-NEXT: v_min_f16_e32 v4, v0, v6
+; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v6
+; GFX10-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4
+; GFX10-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v8half:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.h, v2.h
+; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v4.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.h, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v2.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v3.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v8half:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v5, v0, v4
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v4, v0, v1
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v5
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v5
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v5, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v4
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v3
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v3, v0, v6
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v6
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v8half:
@@ -858,180 +849,174 @@ define half @test_vector_reduce_fminimum_v16half(<16 x half> %v) {
; GFX9-LABEL: test_vector_reduce_fminimum_v16half:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_sdwa v8, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_pk_min_f16 v8, v2, v6
; GFX9-NEXT: v_mov_b32_e32 v9, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v6
+; GFX9-NEXT: s_mov_b32 s0, 0x5040100
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX9-NEXT: v_min_f16_e32 v8, v0, v1
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc
+; GFX9-NEXT: v_pk_min_f16 v8, v0, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4
+; GFX9-NEXT: v_perm_b32 v6, v2, v10, s0
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX9-NEXT: v_min_f16_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_perm_b32 v4, v0, v11, s0
+; GFX9-NEXT: v_pk_min_f16 v4, v4, v6
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v11, v10
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_min_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v3
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v3
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_min_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v3 src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v4
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_min_f16_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v4, vcc
+; GFX9-NEXT: v_min_f16_e32 v2, v6, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v0
+; GFX9-NEXT: v_pk_min_f16 v6, v1, v5
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v3, v7
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v3, v7
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v5
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_min_f16_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v5
+; GFX9-NEXT: v_perm_b32 v3, v2, v4, s0
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v6
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc
+; GFX9-NEXT: v_perm_b32 v5, v1, v7, s0
+; GFX9-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v7, v4
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_min_f16_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v6 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v5, v0, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v7
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v2
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT: v_min_f16_sdwa v1, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v7 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v9, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_vector_reduce_fminimum_v16half:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_min_f16_sdwa v8, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo
-; GFX10-NEXT: v_min_f16_e32 v8, v0, v1
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo
-; GFX10-NEXT: v_min_f16_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo
-; GFX10-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_e32 v1, v0, v3
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v3 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_e32 v1, v0, v4
+; GFX10-NEXT: v_pk_min_f16 v8, v2, v6
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6
+; GFX10-NEXT: v_pk_min_f16 v9, v0, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v8, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo
; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_e32 v1, v0, v5
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v5 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_e32 v1, v0, v6
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v6
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v6 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_e32 v1, v0, v7
-; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v7
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX10-NEXT: v_min_f16_sdwa v1, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v7 src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v9, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_pk_min_f16 v9, v3, v7
+; GFX10-NEXT: v_perm_b32 v4, v2, v8, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v9
+; GFX10-NEXT: v_pk_min_f16 v11, v1, v5
+; GFX10-NEXT: v_perm_b32 v10, v0, v6, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v9, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_pk_min_f16 v4, v10, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v12, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v11, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v10, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v8
+; GFX10-NEXT: v_perm_b32 v6, v3, v9, 0x5040100
+; GFX10-NEXT: v_perm_b32 v8, v1, v7, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX10-NEXT: v_pk_min_f16 v2, v8, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v9
+; GFX10-NEXT: v_min_f16_e32 v5, v4, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v2, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00
+; GFX10-NEXT: v_min_f16_e32 v4, v0, v6
+; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v6
+; GFX10-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4
+; GFX10-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v16half:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v8, v2, v6
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.h, v6.h
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v2, v0, v4
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v0.h, v4.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v8.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v8.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v2.l, s1
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x7e00, v2.h, s2
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v3.h, v7.h
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v1.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s3, v1.h, v5.h
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v4, v2, v0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v2, v3, v7
+; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v3, v1, v5
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v4.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v2.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v3.l, s2
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x7e00, v3.h, s3
; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v2.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v3.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v3.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v4.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v4.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v5.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v5.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v3, v2, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v6.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v6.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.h, v1.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v7.h
-; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v7.h
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -1039,74 +1024,70 @@ define half @test_vector_reduce_fminimum_v16half(<16 x half> %v) {
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v16half:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v9, v0, v8
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v9, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v8, v0, v1
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v9
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v9
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v8
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v3
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v4
+; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v8, v2, v6
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v10, v0, v4
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v4
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v8, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v5
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v5
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v6
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v6
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v7
-; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v7
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v6, v2, v9, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v10, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11
+; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v10, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v12, v1, v5
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v13, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v8, v4, v0, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v6, v8, v6
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v10, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v6
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v10, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v14, v13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v15, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v9
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v6, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v2
+; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v4, v3, v8, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v6, v5, v1, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v11, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v4, v6, v4
+; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v7, v0, v2
; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v8
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v3
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v16half:
More information about the llvm-commits
mailing list